/**
 * Copyright (C) 2014 The SciGraph authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.scigraph.annotation;

import static com.google.common.base.Joiner.on;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.collect.Iterables.getFirst;
import static com.google.common.collect.Iterables.getLast;
import static com.google.common.collect.Iterables.transform;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Maps.newHashMap;
import static com.google.common.collect.Sets.newHashSet;
import static java.lang.String.format;
import io.scigraph.lucene.LuceneUtils;

import java.io.IOException;
import java.io.StringReader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.inject.Inject;

import net.htmlparser.jericho.Attribute;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.StreamedSource;

import org.apache.lucene.analysis.Analyzer;

import com.google.common.base.Function;
import com.google.common.collect.ForwardingMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;

/**
 * {@link EntityProcessor} implementation that streams markup from a reader to a writer, wrapping
 * recognized entities found in text segments in {@code <span>} elements.
 *
 * <p>Text is broken into shingles by a {@link ShingleProducer} running on its own thread; each
 * shingle is handed to the injected {@link EntityRecognizer}.
 */
class EntityProcessorImpl implements EntityProcessor {

  private static final Logger logger = Logger.getLogger(EntityProcessorImpl.class.getName());

  private final Analyzer analyzer;

  private final EntityRecognizer recognizer;

  @Inject
  protected EntityProcessorImpl(EntityRecognizer recognizer) {
    this.recognizer = recognizer;
    analyzer = new EntityAnalyzer();
  }

  /**
   * Starts a new thread that analyzes {@code content} and publishes shingles to the returned
   * queue.
   *
   * @param content the text to analyze
   * @return the queue the producer thread writes to; consumers in this class stop reading when
   *         they take {@link ShingleProducer#END_TOKEN}
   */
  BlockingQueue<List<Token<String>>> startShingleProducer(String content) {
    BlockingQueue<List<Token<String>>> queue = new LinkedBlockingQueue<List<Token<String>>>();
    ShingleProducer producer = new ShingleProducer(analyzer, new StringReader(content), queue);
    Thread t = new Thread(producer, "Shingle Producer Thread");
    t.start();
    return queue;
  }

  /**
   * Joins the text of each token with a single space.
   *
   * @param tokens the tokens making up one shingle
   * @return the space separated token text
   */
  String combineTokens(List<Token<String>> tokens) {
    return on(' ').join(transform(tokens, new Function<Token<String>, String>() {
      @Override
      public String apply(Token<String> input) {
        return input.getToken();
      }
    }));
  }

  /**
   * Finds the entities mentioned in a piece of text.
   *
   * <p>Shingles that start or end with a stopword, or whose joined text is shorter than
   * {@code config.getMinLength()}, are not submitted to the recognizer.
   *
   * @param content the text to annotate; must not be null
   * @param config the annotation configuration
   * @return the annotations after grouping (see {@link #getAnnotationGroups(List, boolean)}),
   *         sorted in their natural order
   * @throws InterruptedException if the thread is interrupted while waiting for shingles
   */
  protected List<EntityAnnotation> getAnnotations(String content, EntityFormatConfiguration config)
      throws InterruptedException {
    checkNotNull(content);
    BlockingQueue<List<Token<String>>> queue = startShingleProducer(content);
    List<EntityAnnotation> annotations = new ArrayList<>();
    while (true) {
      List<Token<String>> tokens = queue.take();
      if (tokens.equals(ShingleProducer.END_TOKEN)) {
        break;
      }
      if (tokens.isEmpty()) {
        // Nothing to look up; also avoids dereferencing a missing first/last token below.
        continue;
      }
      if (LuceneUtils.isStopword(getFirst(tokens, null).getToken())
          || LuceneUtils.isStopword(getLast(tokens).getToken())) {
        continue;
      }

      String candidate = combineTokens(tokens);
      if (candidate.length() < config.getMinLength()) {
        continue;
      }

      // The annotation spans from the start of the first token to the end of the last one.
      int start = tokens.get(0).getStart();
      int end = tokens.get(tokens.size() - 1).getEnd();
      for (Entity entity : recognizer.getEntities(candidate, config)) {
        annotations.add(new EntityAnnotation(entity, start, end));
      }
    }

    List<EntityAnnotation> ret = newArrayList();
    for (EntityAnnotationGroup group : getAnnotationGroups(annotations, config.isLongestOnly())) {
      ret.addAll(group);
    }
    Collections.sort(ret);
    return ret;
  }

  /**
   * Convert a list of annotations into annotation groups.
   *
   * <p>Note that {@code annotationList} is sorted in place (reverse natural order) as a side
   * effect. Consecutive annotations that intersect the current group are merged into it.
   *
   * @param annotationList Annotations; re-ordered by this call
   * @param longestOnly If shorter entities from annotation groups should be removed
   * @return annotation groups, in the order they were encountered in the reverse-sorted list
   */
  static List<EntityAnnotationGroup> getAnnotationGroups(List<EntityAnnotation> annotationList,
      boolean longestOnly) {
    List<EntityAnnotationGroup> groups = new ArrayList<>();
    Collections.sort(annotationList, Collections.reverseOrder());
    PeekingIterator<EntityAnnotation> iter = Iterators.peekingIterator(annotationList.iterator());
    while (iter.hasNext()) {
      EntityAnnotationGroup group = new EntityAnnotationGroup();
      group.add(iter.next());
      // NOTE(review): the entity of the group's first annotation is never recorded here, so one
      // later annotation for that same entity can still join the group. Confirm whether that is
      // intended before changing it.
      Set<Entity> entitiesInGroup = new HashSet<>();
      while (iter.hasNext() && group.intersects(iter.peek())) {
        if (!entitiesInGroup.contains(iter.peek().getToken())) {
          entitiesInGroup.add(iter.peek().getToken());
          group.add(iter.next());
        } else {
          // Entity already present in this group: consume and discard the duplicate.
          iter.next();
        }
      }

      if (longestOnly) {
        // Remove any entries that aren't as long as the first one
        Iterator<EntityAnnotation> groupIter = group.iterator();
        int longest = group.peek().length();
        while (groupIter.hasNext()) {
          EntityAnnotation annot = groupIter.next();
          if (annot.length() < longest) {
            groupIter.remove();
          }
        }
      }

      groups.add(group);
    }

    return groups;
  }

  /**
   * Add markup based on the annotations...<br />
   * TODO: make the markup configurable so as not to limit ourselves to html
   *
   * <p>One {@code <span>} is inserted per annotation group. The closing tag is inserted before
   * the opening tag so that the group's start offset is still valid when it is used.
   *
   * @param annotationList the annotations to mark up; re-ordered by this call
   * @param content the text the annotation offsets refer to
   * @param config supplies the data attribute name and the longest-only flag
   * @return A String with embedded markup
   * @throws IOException declared for callers; nothing in this method performs I/O directly
   */
  protected final String insertSpans(List<EntityAnnotation> annotationList, String content,
      EntityFormatConfiguration config) throws IOException {
    StringBuilder buffer = new StringBuilder(content);
    Set<String> terms = newHashSet();
    Set<String> attrs = newHashSet();
    Set<String> cssClasses = newHashSet();
    // Groups come back from the reverse-sorted list - presumably last-in-document first, so
    // insertions do not shift the offsets of groups still to be processed (verify against
    // EntityAnnotation's ordering).
    for (EntityAnnotationGroup group : getAnnotationGroups(annotationList,
        config.isLongestOnly())) {
      cssClasses.clear();
      attrs.clear();
      terms.clear();
      for (Entity entity : group.getAnnotations()) {
        String serialized = entity.serialize();
        if (!isNullOrEmpty(serialized)) {
          terms.add(serialized);
        }
        cssClasses.add(recognizer.getCssClass());
      }
      if (!terms.isEmpty()) {
        // NOTE(review): serialized terms are written into the attribute unescaped; a term
        // containing a double quote would break the generated markup.
        attrs.add(format("%s=\"%s\"", config.getDataAttrName(), on("|").join(terms)));
      }
      buffer.insert(group.getEnd(), "</span>");
      buffer.insert(group.getStart(),
          format("<span class=\"%s\" %s>", on(" ").join(cssClasses), on(" ").join(attrs)));
    }
    return buffer.toString();
  }

  /**
   * Builds the value for a {@code <base href>} element from a document URL: protocol, host,
   * port (only when it is explicit and differs from the protocol's default) and the path up to
   * and including its last {@code /}.
   *
   * @param url the URL of the document being annotated
   * @return The appropriate base s.t. relative URLs can resolve
   */
  static String getBase(URL url) {
    StringBuilder buffer = new StringBuilder();
    buffer.append(url.getProtocol());
    buffer.append("://");
    buffer.append(url.getHost());
    int port = url.getPort();
    // Compare against the protocol's own default rather than assuming 80, so that e.g.
    // https://host:80/ keeps its port while https://host:443/ does not need one.
    if (port > 0 && port != url.getDefaultPort()) {
      buffer.append(':');
      buffer.append(port);
    }
    if (!isNullOrEmpty(url.getPath()) && url.getPath().contains("/")) {
      String path = url.getPath();
      buffer.append(path.substring(0, path.lastIndexOf("/")));
    }
    buffer.append('/');
    return buffer.toString();
  }

  /**
   * Writes one stylesheet {@code <link>} element per entry in {@code styles}.
   *
   * @param writer the destination
   * @param styles stylesheet hrefs
   * @throws IOException if the writer fails
   */
  private static void injectStyles(Writer writer, List<String> styles) throws IOException {
    for (String style : styles) {
      // The MIME type belongs in the "type" attribute ("style" would be read as inline CSS).
      writer.write(
          format("<link rel=\"stylesheet\" type=\"text/css\" href=\"%s\"></link>", style));
    }
  }

  /**
   * Writes one external {@code <script>} element per entry in {@code scripts}.
   *
   * @param writer the destination
   * @param scripts script srcs
   * @throws IOException if the writer fails
   */
  private static void injectScripts(Writer writer, List<String> scripts) throws IOException {
    for (String script : scripts) {
      writer.write(format(
          "<script type=\"text/javascript\" language=\"javascript\" src=\"%s\"></script>",
          script));
    }
  }

  /**
   * Decides whether text inside the currently open elements should be annotated.
   *
   * <p>With no target ids or classes configured everything is annotated. Otherwise an open
   * element must carry a target id or class. An ignored tag reached while walking the stack
   * always wins and stops the walk.
   *
   * @param eltStack the currently open elements, as iterated by the list
   * @param config supplies the ignore tags, target ids and target classes
   * @return true if text at this position should be annotated
   */
  boolean shouldAnnotate(LinkedList<Element> eltStack, EntityFormatConfiguration config) {
    boolean shouldAnnotate = false;

    // If the user hasn't asked for anything - default to annotate everything
    if (config.getTargetClasses().isEmpty() && config.getTargetIds().isEmpty()) {
      shouldAnnotate = true;
    }

    for (Element tag : eltStack) {
      if (config.getIgnoreTags().contains(tag.getName().toLowerCase(Locale.ROOT))) {
        // If an ignore tag is encountered in the stack stop...
        shouldAnnotate = false;
        break;
      }
      // Attribute values may be null for valueless attributes such as <div class>, so test the
      // value itself rather than just the presence of the key.
      String id = tag.get("id");
      if (id != null && config.getTargetIds().contains(id)) {
        shouldAnnotate = true;
      }
      String classes = tag.get("class");
      if (classes != null) {
        for (String clazz : classes.split("\\s+")) {
          if (!clazz.isEmpty() && config.getTargetClasses().contains(clazz)) {
            shouldAnnotate = true;
          }
        }
      }
    }
    return shouldAnnotate;
  }

  /**
   * Closes the innermost open element called {@code name}, discarding any elements opened after
   * it (these are elements that never received an end tag, e.g. {@code <br>}). If no open
   * element has that name the end tag is a stray one and the stack is left untouched.
   */
  private static void closeElement(LinkedList<Element> eltStack, String name) {
    boolean isOpen = false;
    for (Element elt : eltStack) {
      if (elt.getName().equalsIgnoreCase(name)) {
        isOpen = true;
        break;
      }
    }
    if (!isOpen) {
      return;
    }
    while (!eltStack.isEmpty()) {
      if (eltStack.pop().getName().equalsIgnoreCase(name)) {
        return;
      }
    }
  }

  /**
   * Copies the markup from {@code config.getReader()} to {@code config.getWriter()}, wrapping
   * recognized entities in spans wherever {@link #shouldAnnotate} allows it.
   *
   * <p>When a URL is configured, a {@code <base>} element plus the configured stylesheets and
   * scripts are written right after the opening {@code head} tag. Text that cannot be annotated
   * (writer failure while annotating, or interruption) is written through unannotated.
   *
   * @param config the reader, writer and annotation options; must not be null
   * @return the annotations found, grouped and then sorted in their natural order
   * @throws IOException if reading or writing fails
   */
  @Override
  public List<EntityAnnotation> annotateEntities(EntityFormatConfiguration config)
      throws IOException {
    checkNotNull(config);
    try (StreamedSource source = new StreamedSource(config.getReader())) {
      LinkedList<Element> eltStack = new LinkedList<Element>();
      List<EntityAnnotation> entities = newArrayList();

      for (Segment segment : source) {
        if (segment instanceof StartTag) {
          StartTag tag = (StartTag) segment;
          config.getWriter().write(segment.toString());

          if (tag.getTagType() == StartTagType.NORMAL &&
              // Jericho is not generating end events for minimized element - don't
              // add them to the stack
              !tag.toString().endsWith("/>") && !tag.toString().endsWith("/ >")) {
            eltStack.push(new Element(tag.getName(), tag.getAttributes()));
          }

          if (config.getUrl() != null && HTMLElementName.HEAD.equals(tag.getName())) {
            config.getWriter().write("<base href=\"" + getBase(config.getUrl()) + "\"></base>");
            injectStyles(config.getWriter(), config.getStylesheets());
            injectScripts(config.getWriter(), config.getScripts());
          }
        } else if (segment instanceof EndTag) {
          config.getWriter().write(segment.toString());
          // Pop down to the matching start tag instead of blindly popping one entry: a stray end
          // tag must not throw on an empty stack, and void elements written without "/>" must
          // not linger on the stack after their parent closes.
          closeElement(eltStack, ((EndTag) segment).getName());
        } else if (segment.getClass().equals(Segment.class)) {
          // Plain text between tags
          String text = segment.toString();
          if (shouldAnnotate(eltStack, config)) {
            try {
              List<EntityAnnotation> annotationList = getAnnotations(text, config);
              entities.addAll(annotationList);
              config.getWriter().write(insertSpans(annotationList, text, config));
            } catch (IOException e) {
              config.getWriter().write(text);
              logger.log(Level.WARNING, e.getMessage(), e);
            } catch (InterruptedException e) {
              // Only getAnnotations can be interrupted, so nothing has been written for this
              // segment yet: restore the flag and pass the text through unannotated.
              Thread.currentThread().interrupt();
              config.getWriter().write(text);
            }
          } else {
            config.getWriter().write(text);
          }
        }
      }

      // NOTE(review): annotation offsets come from getAnnotations, which analyzes each text
      // segment separately, yet the groups below are formed across all segments. Confirm this
      // cross-segment grouping is intended.
      List<EntityAnnotation> ret = newArrayList();
      for (EntityAnnotationGroup group : getAnnotationGroups(entities,
          config.isLongestOnly())) {
        ret.addAll(group);
      }
      Collections.sort(ret);
      return ret;
    }
  }

  /**
   * An open markup element: its tag name plus a map view of its attributes (attribute name to
   * attribute value, as reported by the parser).
   */
  private static class Element extends ForwardingMap<String, String> {

    String name;

    private final Map<String, String> attributes = newHashMap();

    Element(String name, Iterable<Attribute> attrs) {
      this.name = name;
      for (Attribute attr : attrs) {
        put(attr.getName(), attr.getValue());
      }
    }

    String getName() {
      return name;
    }

    @Override
    public String toString() {
      return name + " {" + attributes + "}";
    }

    @Override
    protected Map<String, String> delegate() {
      return attributes;
    }

  }

}