package mtas.analysis.token; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.payloads.PayloadHelper; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; /** * The Class MtasToken. */ public abstract class MtasToken { /** The Constant log. */ private static final Log log = LogFactory.getLog(MtasToken.class); /** The Constant DELIMITER. */ public static final String DELIMITER = "\u0001"; /** The Constant regexpPrePostFix. */ public static final String regexpPrePostFix = "(.*)" + DELIMITER + "(.[^\u0000]*)"; /** The Constant patternPrePostFix. */ public static final Pattern patternPrePostFix = Pattern .compile(regexpPrePostFix); /** The token id. */ private Integer tokenId; /** The token ref. */ private Long tokenRef = null; /** The term ref. */ private Long termRef = null; /** The prefix id. */ private Integer prefixId = null; /** The token type. */ protected String tokenType = null; /** The token parent id. */ private Integer tokenParentId = null; /** The token value. */ private String tokenValue = null; /** The token position. */ private MtasPosition tokenPosition = null; /** The token offset. */ private MtasOffset tokenOffset = null; /** The token real offset. */ private MtasOffset tokenRealOffset = null; /** The token payload. */ private BytesRef tokenPayload = null; /** The provide offset. */ private Boolean provideOffset = true; /** The provide real offset. */ private Boolean provideRealOffset = true; /** The provide parent id. */ private Boolean provideParentId = true; /** * Instantiates a new mtas token. * * @param tokenId the token id * @param value the value */ protected MtasToken(Integer tokenId, String value) { this.tokenId = tokenId; setType(); setValue(value); } /** * Instantiates a new mtas token. * * @param tokenId the token id * @param prefix the prefix * @param postfix the postfix */ protected MtasToken(Integer tokenId, String prefix, String postfix) { Objects.requireNonNull(prefix, "prefix is obligatory"); this.tokenId = tokenId; setType(); if (postfix != null) { setValue(prefix + DELIMITER + postfix); } else { setValue(prefix + DELIMITER); } } /** * Instantiates a new mtas token. * * @param tokenId the token id * @param value the value * @param position the position */ protected MtasToken(Integer tokenId, String value, Integer position) { this(tokenId, value); addPosition(position); } /** * Instantiates a new mtas token. * * @param tokenId the token id * @param prefix the prefix * @param postfix the postfix * @param position the position */ protected MtasToken(Integer tokenId, String prefix, String postfix, Integer position) { this(tokenId, prefix, postfix); addPosition(position); } /** * Sets the token ref. * * @param ref the new token ref */ final public void setTokenRef(Long ref) { tokenRef = ref; } /** * Gets the token ref. * * @return the token ref */ final public Long getTokenRef() { return tokenRef; } /** * Sets the term ref. * * @param ref the new term ref */ final public void setTermRef(Long ref) { termRef = ref; } /** * Gets the term ref. * * @return the term ref */ final public Long getTermRef() { return termRef; } /** * Sets the prefix id. * * @param id the new prefix id */ final public void setPrefixId(int id) { prefixId = id; } /** * Gets the prefix id. * * @return the prefix id * @throws IOException Signals that an I/O exception has occurred. */ final public int getPrefixId() throws IOException { if (prefixId != null) { return prefixId; } else { throw new IOException("no prefixId"); } } /** * Sets the id. * * @param id the new id */ final public void setId(Integer id) { tokenId = id; } /** * Gets the id. * * @return the id */ final public Integer getId() { return tokenId; } /** * Sets the parent id. * * @param id the new parent id */ final public void setParentId(Integer id) { tokenParentId = id; } /** * Gets the parent id. * * @return the parent id */ final public Integer getParentId() { return tokenParentId; } /** * Sets the provide parent id. * * @param provide the new provide parent id */ final public void setProvideParentId(Boolean provide) { provideParentId = provide; } /** * Gets the provide parent id. * * @return the provide parent id */ final public boolean getProvideParentId() { return provideParentId; } /** * Sets the type. */ protected void setType() { throw new IllegalArgumentException("Type not implemented"); } /** * Gets the type. * * @return the type */ final public String getType() { return tokenType; } /** * Adds the position. * * @param position the position */ final public void addPosition(int position) { if (tokenPosition == null) { tokenPosition = new MtasPosition(position); } else { tokenPosition.add(position); } } /** * Adds the position range. * * @param start the start * @param end the end */ final public void addPositionRange(int start, int end) { if (tokenPosition == null) { tokenPosition = new MtasPosition(start, end); } else { int[] positions = new int[end - start + 1]; for (int i = start; i <= end; i++) { positions[i - start] = i; } tokenPosition.add(positions); } } /** * Adds the positions. * * @param positions the positions */ final public void addPositions(int[] positions) { if (positions != null && positions.length > 0) { if (tokenPosition == null) { tokenPosition = new MtasPosition(positions); } else { tokenPosition.add(positions); } } } /** * Adds the positions. * * @param list the list */ final public void addPositions(Set<Integer> list) { int[] positions = ArrayUtils .toPrimitive(list.toArray(new Integer[list.size()])); addPositions(positions); } /** * Check position type. * * @param type the type * @return the boolean */ final public Boolean checkPositionType(String type) { if (tokenPosition == null) { return false; } else { return tokenPosition.checkType(type); } } /** * Gets the position start. * * @return the position start */ final public Integer getPositionStart() { return tokenPosition == null ? null : tokenPosition.getStart(); } /** * Gets the position end. * * @return the position end */ final public Integer getPositionEnd() { return tokenPosition == null ? null : tokenPosition.getEnd(); } /** * Gets the position length. * * @return the position length */ final public Integer getPositionLength() { return tokenPosition == null ? null : tokenPosition.getLength(); } /** * Gets the positions. * * @return the positions */ final public int[] getPositions() { return tokenPosition == null ? null : tokenPosition.getPositions(); } /** * Check offset. * * @return the boolean */ final public Boolean checkOffset() { if ((tokenOffset == null) || !provideOffset) { return false; } else { return true; } } /** * Check real offset. * * @return the boolean */ final public Boolean checkRealOffset() { if ((tokenRealOffset == null) || !provideRealOffset) { return false; } else if (tokenOffset == null) { return true; } else if (tokenOffset.getStart() == tokenRealOffset.getStart() && tokenOffset.getEnd() == tokenRealOffset.getEnd()) { return false; } else { return true; } } /** * Sets the offset. * * @param start the start * @param end the end */ final public void setOffset(Integer start, Integer end) { if ((start == null) || (end == null)) { // do nothing } else if (start > end) { throw new IllegalArgumentException("Start offset after end offset"); } else { tokenOffset = new MtasOffset(start, end); } } /** * Adds the offset. * * @param start the start * @param end the end */ final public void addOffset(Integer start, Integer end) { if (tokenOffset == null) { setOffset(start, end); } else if ((start == null) || (end == null)) { // do nothing } else if (start > end) { throw new IllegalArgumentException("Start offset after end offset"); } else { tokenOffset.add(start, end); } } /** * Sets the provide offset. * * @param provide the new provide offset */ final public void setProvideOffset(Boolean provide) { provideOffset = provide; } /** * Sets the real offset. * * @param start the start * @param end the end */ final public void setRealOffset(Integer start, Integer end) { if ((start == null) || (end == null)) { // do nothing } else if (start > end) { throw new IllegalArgumentException( "Start real offset after end real offset"); } else { tokenRealOffset = new MtasOffset(start, end); } } /** * Sets the provide real offset. * * @param provide the new provide real offset */ final public void setProvideRealOffset(Boolean provide) { provideRealOffset = provide; } /** * Gets the provide offset. * * @return the provide offset */ final public boolean getProvideOffset() { return provideOffset; } /** * Gets the provide real offset. * * @return the provide real offset */ final public boolean getProvideRealOffset() { return provideRealOffset; } /** * Gets the offset start. * * @return the offset start */ final public Integer getOffsetStart() { return tokenOffset == null ? null : tokenOffset.getStart(); } /** * Gets the offset end. * * @return the offset end */ final public Integer getOffsetEnd() { return tokenOffset == null ? null : tokenOffset.getEnd(); } /** * Gets the real offset start. * * @return the real offset start */ final public Integer getRealOffsetStart() { return tokenRealOffset == null ? null : tokenRealOffset.getStart(); } /** * Gets the real offset end. * * @return the real offset end */ final public Integer getRealOffsetEnd() { return tokenRealOffset == null ? null : tokenRealOffset.getEnd(); } /** * Sets the value. * * @param value the new value */ public void setValue(String value) { tokenValue = value; } /** * Gets the prefix from value. * * @param value the value * @return the prefix from value */ public static String getPrefixFromValue(String value) { if (value == null) { return null; } else if (value.contains(DELIMITER)) { String[] list = value.split(DELIMITER); if (list != null && list.length > 0) { return list[0].replaceAll("\u0000", ""); } else { return null; } } else { return value.replaceAll("\u0000", ""); } } /** * Gets the postfix from value. * * @param value the value * @return the postfix from value */ public static String getPostfixFromValue(String value) { String postfix = ""; Matcher m = patternPrePostFix.matcher(value); if (m.find()) { postfix = m.group(2); } return postfix; } /** * Gets the postfix from value. * * @param term the term * @return the postfix from value */ public static String getPostfixFromValue(BytesRef term) { int i = term.offset; int length = term.offset + term.length; byte[] postfix = new byte[length]; while (i < length) { if ((term.bytes[i] & 0b10000000) == 0b00000000) { if (term.bytes[i] == 0b00000001) { i++; break; } else { i++; } } else if ((term.bytes[i] & 0b11100000) == 0b11000000) { i += 2; } else if ((term.bytes[i] & 0b11110000) == 0b11100000) { i += 3; } else if ((term.bytes[i] & 0b11111000) == 0b11110000) { i += 4; } else if ((term.bytes[i] & 0b11111100) == 0b11111000) { i += 5; } else if ((term.bytes[i] & 0b11111110) == 0b11111100) { i += 6; } else { return ""; } } int start = i; while (i < length) { if ((term.bytes[i] & 0b10000000) == 0b00000000) { if (term.bytes[i] == 0b00000000) { break; } postfix[i] = term.bytes[i]; i++; } else if ((term.bytes[i] & 0b11100000) == 0b11000000) { postfix[i] = term.bytes[i]; postfix[i + 1] = term.bytes[i + 1]; i += 2; } else if ((term.bytes[i] & 0b11110000) == 0b11100000) { postfix[i] = term.bytes[i]; postfix[i + 1] = term.bytes[i + 1]; postfix[i + 2] = term.bytes[i + 2]; i += 3; } else if ((term.bytes[i] & 0b11111000) == 0b11110000) { postfix[i] = term.bytes[i]; postfix[i + 1] = term.bytes[i + 1]; postfix[i + 2] = term.bytes[i + 2]; postfix[i + 3] = term.bytes[i + 3]; i += 4; } else if ((term.bytes[i] & 0b11111100) == 0b11111000) { postfix[i] = term.bytes[i]; postfix[i + 1] = term.bytes[i + 1]; postfix[i + 2] = term.bytes[i + 2]; postfix[i + 3] = term.bytes[i + 3]; postfix[i + 4] = term.bytes[i + 4]; i += 5; } else if ((term.bytes[i] & 0b11111110) == 0b11111100) { postfix[i] = term.bytes[i]; postfix[i + 1] = term.bytes[i + 1]; postfix[i + 2] = term.bytes[i + 2]; postfix[i + 3] = term.bytes[i + 3]; postfix[i + 4] = term.bytes[i + 4]; postfix[i + 5] = term.bytes[i + 5]; i += 6; } else { return ""; } } return new String(Arrays.copyOfRange(postfix, start, i), StandardCharsets.UTF_8); } /** * Gets the value. * * @return the value */ public String getValue() { return tokenValue; } /** * Gets the prefix. * * @return the prefix */ public String getPrefix() { return getPrefixFromValue(tokenValue); } /** * Gets the postfix. * * @return the postfix */ public String getPostfix() { return getPostfixFromValue(tokenValue); } /** * Check parent id. * * @return the boolean */ final public Boolean checkParentId() { if ((tokenParentId == null) || !provideParentId) { return false; } else { return true; } } /** * Check payload. * * @return the boolean */ final public Boolean checkPayload() { if (tokenPayload == null) { return false; } else { return true; } } /** * Sets the payload. * * @param payload the new payload */ public void setPayload(BytesRef payload) { tokenPayload = payload; } /** * Gets the payload. * * @return the payload */ public BytesRef getPayload() { return tokenPayload; } /** * Creates the automaton map. * * @param prefix the prefix * @param valueList the value list * @param filter the filter * @return the map */ public static Map<String, Automaton> createAutomatonMap(String prefix, List<String> valueList, Boolean filter) { HashMap<String, Automaton> automatonMap = new HashMap<>(); if (valueList != null) { for (String item : valueList) { if (filter) { item = item.replaceAll("([\\\"\\)\\(\\<\\>\\.\\@\\#\\]\\[\\{\\}])", "\\\\$1"); } automatonMap.put(item, new RegExp(prefix + MtasToken.DELIMITER + item + "\u0000*") .toAutomaton()); } } return automatonMap; } /** * Byte run automaton map. * * @param automatonMap the automaton map * @return the map */ public static Map<String, ByteRunAutomaton> byteRunAutomatonMap( Map<String, Automaton> automatonMap) { HashMap<String, ByteRunAutomaton> byteRunAutomatonMap = new HashMap<>(); if (automatonMap != null) { for (Entry<String, Automaton> entry : automatonMap.entrySet()) { byteRunAutomatonMap.put(entry.getKey(), new ByteRunAutomaton(entry.getValue())); } } return byteRunAutomatonMap; } /** * Creates the automata. * * @param prefix the prefix * @param regexp the regexp * @param automatonMap the automaton map * @return the list * @throws IOException Signals that an I/O exception has occurred. */ public static List<CompiledAutomaton> createAutomata(String prefix, String regexp, Map<String, Automaton> automatonMap) throws IOException { List<CompiledAutomaton> list = new ArrayList<>(); Automaton automatonRegexp = null; if (regexp != null) { RegExp re = new RegExp(prefix + MtasToken.DELIMITER + regexp + "\u0000*"); automatonRegexp = re.toAutomaton(); } int step = 500; List<String> keyList = new ArrayList<>(automatonMap.keySet()); for (int i = 0; i < keyList.size(); i += step) { int localStep = step; boolean success = false; CompiledAutomaton compiledAutomaton = null; while (!success) { success = true; int next = Math.min(keyList.size(), i + localStep); List<Automaton> listAutomaton = new ArrayList<>(); for (int j = i; j < next; j++) { listAutomaton.add(automatonMap.get(keyList.get(j))); } Automaton automatonList = Operations.union(listAutomaton); Automaton automaton; if (automatonRegexp != null) { automaton = Operations.intersection(automatonList, automatonRegexp); } else { automaton = automatonList; } try { compiledAutomaton = new CompiledAutomaton(automaton); } catch (TooComplexToDeterminizeException e) { log.debug(e); success = false; if (localStep > 1) { localStep /= 2; } else { throw new IOException("TooComplexToDeterminizeException"); } } } list.add(compiledAutomaton); } return list; } /* * (non-Javadoc) * * @see java.lang.Object#toString() */ @Override public String toString() { String text = ""; text += "[" + String.format("%05d", getId()) + "] "; text += ((getRealOffsetStart() == null) ? "[-------,-------]" : "[" + String.format("%07d", getRealOffsetStart()) + "-" + String.format("%07d", getRealOffsetEnd()) + "]"); text += (provideRealOffset ? " " : "* "); text += ((getOffsetStart() == null) ? "[-------,-------]" : "[" + String.format("%07d", getOffsetStart()) + "-" + String.format("%07d", getOffsetEnd()) + "]"); text += (provideOffset ? " " : "* "); if (getPositionLength() == null) { text += String.format("%11s", ""); } else if (getPositionStart().equals(getPositionEnd())) { text += String.format("%11s", "[" + getPositionStart() + "]"); } else if ((getPositions() == null) || (getPositions().length == (1 + getPositionEnd() - getPositionStart()))) { text += String.format("%11s", "[" + getPositionStart() + "-" + getPositionEnd() + "]"); } else { text += String.format("%11s", Arrays.toString(getPositions())); } text += ((getParentId() == null) ? "[-----]" : "[" + String.format("%05d", getParentId()) + "]"); text += (provideParentId ? " " : "* "); BytesRef payload = getPayload(); text += (payload == null) ? "[------] " : "[" + String .format("%.4f", PayloadHelper.decodeFloat(Arrays.copyOfRange(payload.bytes, payload.offset, (payload.offset + payload.length)))) + "] "; text += String.format("%25s", "[" + getPrefix() + "]") + " "; text += ((getPostfix() == null) ? "---" : "[" + getPostfix() + "]") + " "; return text; } }