/* * This software was produced for the U. S. Government * under Contract No. W15P7T-11-C-F600, and is * subject to the Rights in Noncommercial Computer Software * and Noncommercial Computer Software Documentation * Clause 252.227-7014 (JUN 1995) * * Copyright 2013 The MITRE Corporation. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.handler.tagger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Locale; import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.solr.common.SolrException; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.xml.sax.InputSource; public class XmlInterpolationTest extends TaggerTestCase { private static DocumentBuilder xmlDocBuilder; @BeforeClass public static void beforeClass() throws Exception { DocumentBuilderFactory xmlDocBuilderFactory = DocumentBuilderFactory.newInstance(); xmlDocBuilderFactory.setValidating(true); xmlDocBuilderFactory.setNamespaceAware(true); xmlDocBuilder = xmlDocBuilderFactory.newDocumentBuilder(); initCore("solrconfig-tagger.xml", "schema-tagger.xml"); } @AfterClass public static void cleanUpAfterClass() throws Exception { xmlDocBuilder = null; } @Override public void setUp() throws Exception { super.setUp(); baseParams.set("field", "name_tagXml"); baseParams.set("overlaps", "LONGEST_DOMINANT_RIGHT"); baseParams.set("xmlOffsetAdjust", "true"); } @Test public void test() throws Exception { buildNames("start end"); assertXmlTag("<doc>before start <!-- c --> end after</doc>", true); assertXmlTag("<doc>before start <br/> end after</doc>", true); assertXmlTag("<doc>before <em>start</em> <b>end</b> after</doc>", true); assertXmlTag("<doc>before <em>start</em> end after</doc>", true); assertXmlTag("<doc>before start end<em> after</em></doc>", true); assertXmlTag("<doc><em>before </em>start end after</doc>", true);//adjacent tags assertXmlTag("<doc>before <b> <em>start</em> </b> end after</doc>", true); assertXmlTag("<doc>before <b> <em>start</em> </b> <em> end </em> after</doc>", true); assertXmlTag("<doc><p>before start</p> end after</doc>", false); assertXmlTag("<doc>before start <p>end after</p> </doc>", false); assertXmlTag("<doc>before <em a='A' b='B'>start</em> <b a='A' b='B'>end</b> after</doc>", true); } @Test(expected = SolrException.class) public void testInvalidXml() throws Exception { assertXmlTag("notXml", false); } @Test(expected = Exception.class) public void testValidatingXml() throws Exception { validateXml("foo"); } protected void assertXmlTag(String docText, boolean expected) throws Exception { final SolrQueryRequest req = reqDoc(docText); try { // 5.4 and beyond we can use try-with-resources final SolrQueryResponse rsp = h.queryAndResponse(req.getParams().get("qt"), req); final TestTag[] testTags = pullTagsFromResponse(req, rsp); if (!expected) { assertEquals(0, testTags.length); } else { assertEquals(1, testTags.length); final TestTag tag = testTags[0]; validateXml(insertAnchorAtOffsets(docText, tag.startOffset, tag.endOffset, tag.docName)); } } finally { req.close(); } } protected void validateXml(String xml) throws Exception { // the "parse" method also validates XML, will throw an exception if mis-formatted xmlDocBuilder.parse(new InputSource(new StringReader(xml))); } @Test public void testLuceneHtmlFilterBehavior() { String docText; //Close tag adjacent to start & end results in end offset including the close tag. LUCENE-5734 docText = "<doc><a><b>start</b> end</a></doc>"; assertArrayEquals(tagExpect(docText, "start", "end</a>"), analyzeTagOne(docText, "start", "end")); //Space after "end" means offset doesn't include </a> docText = "<doc><a><b>start</b> end </a></doc>"; assertArrayEquals(tagExpect(docText, "start", "end"), analyzeTagOne(docText, "start", "end")); //Matches entity at end final String endStr = String.format(Locale.ROOT, "en&#x%02x;", (int) 'd'); docText = "<doc>start " + endStr + "</doc>"; assertArrayEquals(tagExpect(docText, "start", endStr), analyzeTagOne(docText, "start", "end")); //... and at start final String startStr = String.format(Locale.ROOT, "&#x%02x;tart", (int) 's'); docText = "<doc>" + startStr + " end</doc>"; assertArrayEquals(tagExpect(docText, startStr, "end"), analyzeTagOne(docText, "start", "end")); //Test ignoring proc instructions & comments. Note: doesn't expand the entity to "start". docText = "<!DOCTYPE start [ " + "<!ENTITY start \"start\">" + "]><start><?start start ?><!-- start --><start/>&start;</start>"; assertArrayEquals(new int[]{-1, -1}, analyzeTagOne(docText, "start", "start")); //Test entity behavior docText = " — – & &foo;   a b"; assertArrayEquals(new String[]{"—", "–", "&", "&foo;", "\u00A0", "a", "b"}, analyzeReturnTokens(docText)); //Observe offset adjustment of trailing entity to end tag docText = "foo bar"; assertArrayEquals(tagExpect(docText, "foo", "foo"), analyzeTagOne(docText, "foo", "foo")); } private String insertAnchorAtOffsets(String docText, int startOffset, int endOffset, String id) { String insertStart = "<A id='"+ id +"'>";// (normally we'd escape id) String insertEnd = "</A>"; return docText.substring(0, startOffset) + insertStart + docText.substring(startOffset, endOffset) + insertEnd + docText.substring(endOffset); } private int[] tagExpect(String docText, String start, String end) { return new int[]{docText.indexOf(start), docText.indexOf(end) + end.length()}; } private int[] analyzeTagOne(String docText, String start, String end) { int[] result = {-1, -1}; Reader filter = new HTMLStripCharFilter(new StringReader(docText)); WhitespaceTokenizer ts = new WhitespaceTokenizer(); final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); try { ts.setReader(filter); ts.reset(); while (ts.incrementToken()) { final String termString = termAttribute.toString(); if (termString.equals(start)) result[0] = offsetAttribute.startOffset(); if (termString.equals(end)) { result[1] = offsetAttribute.endOffset(); return result; } } ts.end(); } catch (IOException e) { throw new RuntimeException(e); } finally { IOUtils.closeQuietly(ts); } return result; } private String[] analyzeReturnTokens(String docText) { List<String> result = new ArrayList<>(); Reader filter = new HTMLStripCharFilter(new StringReader(docText), Collections.singleton("unescaped")); WhitespaceTokenizer ts = new WhitespaceTokenizer(); final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class); try { ts.setReader(filter); ts.reset(); while (ts.incrementToken()) { result.add(termAttribute.toString()); } ts.end(); } catch (IOException e) { throw new RuntimeException(e); } finally { IOUtils.closeQuietly(ts); } return result.toArray(new String[result.size()]); } }