java.text.BreakIterator#getSentenceInstance

Source File: SimpleGroovyDoc.java From groovy with Apache License 2.0

6 votes

/**
 * Extracts the summary (first sentence) from raw doc-comment text.
 * <p>
 * The text is first cut at whichever comes first of a <code>&lt;p&gt;</code>
 * tag, a blank line, or a line starting with a known block tag; the
 * remainder is then shortened to its first sentence using a sentence
 * {@link BreakIterator} for the default locale.
 *
 * @param raw the comment text, possibly with leading <code>*</code> on each line
 * @return the first sentence, or the whole cleaned-up text when the
 *         break iterator reports no sentence boundary
 */
public static String calculateFirstSentence(String raw) {
    // Strip the leading '*' decoration from every comment line.
    String summary = raw.replaceAll("(?m)^\\s*\\*", "").trim();

    // Each of the following is treated as an end-of-sentence marker:
    // a <p> paragraph tag ...
    summary = summary.replaceFirst("(?ms)<p>.*", "").trim();
    // ... a completely blank line ...
    summary = summary.replaceFirst("(?ms)\\n\\s*\\n.*", "").trim();
    // ... or the start of an @tag section.
    summary = summary.replaceFirst("(?ms)\\n\\s*@(see|param|throws|return|author|since|exception|version|deprecated|todo)\\s.*", "").trim();

    // Locale-sensitive sentence detection. todo - allow locale to be passed in
    BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.getDefault());
    sentences.setText(summary);
    int from = sentences.first();
    int to = sentences.next();
    if (from == BreakIterator.DONE || to == BreakIterator.DONE) {
        // No sentence boundary found: keep the text as it is.
        return summary;
    }
    // Abbreviate the comment to its first sentence for the summary.
    return summary.substring(from, to);
}

Source File: DocCommentParser.java From hottub with GNU General Public License v2.0

6 votes

/**
 * Creates a parser for one doc comment, taking its name table, tree
 * maker, locale and options from the given factory.
 * A sentence {@link BreakIterator} is only created when the
 * "breakIterator" option is set or the effective locale's language is
 * not English; otherwise <code>sentenceBreaker</code> is left unassigned here.
 */
DocCommentParser(ParserFactory fac, DiagnosticSource diagSource, Comment comment) {
    this.fac = fac;
    this.diagSource = diagSource;
    this.comment = comment;
    names = fac.names;
    m = fac.docTreeMaker;

    // Fall back to the JVM default when the factory carries no locale.
    Locale effectiveLocale = fac.locale;
    if (effectiveLocale == null) {
        effectiveLocale = Locale.getDefault();
    }

    boolean breakIteratorRequested = fac.options.isSet("breakIterator");
    if (breakIteratorRequested
            || !Locale.ENGLISH.getLanguage().equals(effectiveLocale.getLanguage())) {
        sentenceBreaker = BreakIterator.getSentenceInstance(effectiveLocale);
    }

    initTagParsers();
}

Source File: ImportExportServiceImpl.java From webanno with Apache License 2.0

6 votes

/**
 * Walks the document text of the given CAS with a US-locale sentence
 * {@link BreakIterator} and indexes one sentence annotation per
 * non-empty span.
 *
 * @param aCas the CAS whose document text is split and which receives
 *             the created sentences
 */
public static void splitSentences(CAS aCas)
{
    BreakIterator sentenceBreaker = BreakIterator.getSentenceInstance(Locale.US);
    sentenceBreaker.setText(aCas.getDocumentText());

    int begin = sentenceBreaker.first();
    for (int end = sentenceBreaker.next(); end != BreakIterator.DONE;
            end = sentenceBreaker.next()) {
        int[] span = { begin, end };
        // presumably narrows the span to exclude surrounding whitespace - verify against trim()
        trim(aCas.getDocumentText(), span);
        if (!isEmpty(span[0], span[1])) {
            aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
        }
        // The next sentence starts where this one ended.
        begin = end;
    }
}

Source File: AccessibleHTML.java From Bytecoder with Apache License 2.0

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: DocLocale.java From openjdk-jdk8u with GNU General Public License v2.0

5 votes

/**
 * Creates the locale helper for a documentation run.
 * The locale is obtained from <code>getLocale()</code>; when one is
 * found it also becomes the JVM-wide default locale, otherwise
 * <code>docenv.exit()</code> is invoked. A collator and a sentence
 * break iterator are then created for that locale.
 *
 * @param docenv the documentation environment, used to exit when no locale is found
 * @param localeName the requested locale name
 * @param useBreakIterator stored as-is in the field of the same name
 */
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
    this.docenv = docenv;
    this.localeName = localeName;
    this.useBreakIterator = useBreakIterator;

    locale = getLocale();
    if (locale != null) {
        // NOTE: updating global state
        Locale.setDefault(locale);
    } else {
        // presumably does not return normally - verify against DocEnv.exit()
        docenv.exit();
    }

    collator = Collator.getInstance(locale);
    sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}

Source File: AccessibleHTML.java From hottub with GNU General Public License v2.0

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: TestCustomSeparatorBreakIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compares the custom-separator iterator with the ROOT-locale sentence
 * iterator on inputs passed with the slice arguments (0, 1) and (0, 0).
 */
public void testSliceEnd() throws Exception {
  BreakIterator reference = BreakIterator.getSentenceInstance(Locale.ROOT);
  BreakIterator candidate = new CustomSeparatorBreakIterator(randomSeparator());

  // Inputs of growing prefix length, all checked with the same (0, 1) arguments.
  for (String text : new String[] {"a000", "ab000", "abc000"}) {
    assertSameBreaks(text, 0, 1, reference, candidate);
  }
  // Same suffix without any prefix, checked with (0, 0).
  assertSameBreaks("000", 0, 0, reference, candidate);
}

Source File: LocalizedBundleInfo.java From netbeans with Apache License 2.0

5 votes

/**
 * Splits the given text into consecutive sentences. Each returned
 * element is the exact substring between two sentence boundaries, so
 * trailing whitespace stays attached to the sentence it follows.
 *
 * @param text the text to split
 * @return the sentences in order; empty when no boundary follows the start
 */
private static String[] splitBySentence(String text) {
    // Use Locale.US since the customizer is setting the default (US) locale text only:
    BreakIterator breaker = BreakIterator.getSentenceInstance(Locale.US);
    breaker.setText(text);

    List<String> parts = new ArrayList<String>();
    for (int from = breaker.first(), to = breaker.next();
            to != BreakIterator.DONE;
            from = to, to = breaker.next()) {
        parts.add(text.substring(from, to));
    }
    return parts.toArray(new String[parts.size()]);
}

Source File: AccessibleHTML.java From JDKSourceCode1.8 with MIT License

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: AccessibleHTML.java From jdk8u_jdk with GNU General Public License v2.0

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: AccessibleHTML.java From jdk8u-jdk with GNU General Public License v2.0

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: AccessibleHTML.java From jdk8u-dev-jdk with GNU General Public License v2.0

5 votes

/**
 * Locates the word or sentence (selected by <code>part</code>) that
 * surrounds <code>index</code> within the paragraph text returned by
 * <code>getParagraphElementText</code>. On success the paragraph
 * segment itself is narrowed and returned: <code>offset</code> and
 * <code>count</code> delimit the word/sentence in the backing array,
 * and <code>modelOffset</code> is shifted by the same amount as
 * <code>offset</code>. Returns null when no paragraph text is
 * available, when <code>part</code> is neither
 * <code>AccessibleText.WORD</code> nor
 * <code>AccessibleText.SENTENCE</code>, or when no boundary pair is
 * found inside the segment.
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    IndexedSegment paragraph = getParagraphElementText(index);
    if (paragraph == null) {
        return null;
    }

    BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        return null;
    }

    paragraph.first();
    breaker.setText(paragraph);

    // Translate the model index into a position in the segment's array.
    int end = breaker.following(index - paragraph.modelOffset + paragraph.offset);
    int limit = paragraph.offset + paragraph.count;
    if (end == BreakIterator.DONE || end > limit) {
        return null;
    }
    int begin = breaker.previous();
    if (begin == BreakIterator.DONE || begin >= limit) {
        return null;
    }

    // modelOffset must be adjusted before offset is overwritten.
    paragraph.modelOffset += begin - paragraph.offset;
    paragraph.offset = begin;
    paragraph.count = end - begin;
    return paragraph;
}

Source File: SpellCheckIterator.java From xds-ide with Eclipse Public License 1.0

5 votes

/**
 * Creates a new spell check iterator.
 * <p>
 * The region's text is read once from the document; if reading fails
 * for any reason the iterator works on an empty string instead. The
 * word iterator is positioned on the first word, and every sentence
 * boundary of the content is recorded in <code>fSentenceBreaks</code>.
 *
 * @param document the document containing the specified partition
 * @param region the region to spell check
 * @param locale the locale to use for spell checking
 * @param breakIterator the break-iterator
 */
public SpellCheckIterator(IDocument document, IRegion region, Locale locale, BreakIterator breakIterator) {
	fOffset= region.getOffset();
	fWordIterator= breakIterator;
	fDelimiter= TextUtilities.getDefaultLineDelimiter(document);

	String content;
	try {

		content= document.get(region.getOffset(), region.getLength());

	} catch (Exception exception) {
		// Deliberate best effort: an unreadable region is treated as empty text.
		content= ""; //$NON-NLS-1$
	}
	fContent= content;

	fWordIterator.setText(content);
	fPredecessor= fWordIterator.first();
	fSuccessor= fWordIterator.next();

	final BreakIterator iterator= BreakIterator.getSentenceInstance(locale);
	iterator.setText(content);

	// Starts at the iterator's current position right after setText and
	// collects every boundary until the iterator is exhausted.
	int offset= iterator.current();
	while (offset != BreakIterator.DONE) {

		// Integer.valueOf replaces the deprecated Integer(int) constructor
		// and can reuse cached instances.
		fSentenceBreaks.add(Integer.valueOf(offset));
		offset= iterator.next();
	}
}

Source File: DocLocale.java From TencentKona-8 with GNU General Public License v2.0

5 votes

/**
 * Creates the locale helper for a documentation run.
 * The locale is obtained from <code>getLocale()</code>; when one is
 * found it also becomes the JVM-wide default locale, otherwise
 * <code>docenv.exit()</code> is invoked. A collator and a sentence
 * break iterator are then created for that locale.
 *
 * @param docenv the documentation environment, used to exit when no locale is found
 * @param localeName the requested locale name
 * @param useBreakIterator stored as-is in the field of the same name
 */
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
    this.docenv = docenv;
    this.localeName = localeName;
    this.useBreakIterator = useBreakIterator;

    locale = getLocale();
    if (locale != null) {
        // NOTE: updating global state
        Locale.setDefault(locale);
    } else {
        // presumably does not return normally - verify against DocEnv.exit()
        docenv.exit();
    }

    collator = Collator.getInstance(locale);
    sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}

Source File: BreakIteratorTest.java From jdk8u_jdk with GNU General Public License v2.0

5 votes

/**
 * Sets up one break iterator per boundary kind (character, word, line,
 * sentence), each obtained from the no-argument factory method.
 */
public BreakIteratorTest()
{
    this.characterBreak = BreakIterator.getCharacterInstance();
    this.wordBreak = BreakIterator.getWordInstance();
    this.lineBreak = BreakIterator.getLineInstance();
    this.sentenceBreak = BreakIterator.getSentenceInstance();
}

Source File: UnifiedSolrHighlighter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Creates the {@link BreakIterator} named by {@code type} for the specified locale.
 *
 * @param type one of "SENTENCE", "LINE", "WORD" or "CHARACTER";
 *             {@code null} selects the sentence iterator
 * @param locale the locale passed to the {@link BreakIterator} factory method
 * @return a new break iterator of the requested kind
 * @throws IllegalArgumentException if {@code type} is any other value
 */
protected BreakIterator parseBreakIterator(String type, Locale locale) {
  if (type == null) {
    // No explicit type: sentence boundaries are the default.
    return BreakIterator.getSentenceInstance(locale);
  }
  switch (type) {
    case "SENTENCE":
      return BreakIterator.getSentenceInstance(locale);
    case "LINE":
      return BreakIterator.getLineInstance(locale);
    case "WORD":
      return BreakIterator.getWordInstance(locale);
    case "CHARACTER":
      return BreakIterator.getCharacterInstance(locale);
    default:
      throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
  }
}

Source File: PropertiesDocReader.java From jasperreports with GNU Lesser General Public License v3.0

4 votes

/**
 * Generates default description messages for properties that have a
 * documentation node but no description entry in
 * <code>propertyMessages</code>, and writes them to a resource in the
 * class output location.
 * <p>
 * The message for a property is the first sentence of its documentation
 * text (US-locale sentence rules) after the leading/trailing whitespace
 * patterns have been applied. When the sentence iterator reports no
 * boundary (for example for empty text) the whole text is used.
 * Nothing is written when no default message was produced.
 *
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         creating or writing the resource
 */
public void writeDefaultMessages()
{
	Properties defaultMessages = new Properties();
	BreakIterator sentenceBreaks = BreakIterator.getSentenceInstance(Locale.US);
	for (CompiledPropertyMetadata prop : properties.getProperties())
	{
		String descriptionMessage = PropertyMetadataConstants.PROPERTY_DESCRIPTION_PREFIX + prop.getName();
		if (propertyMessages == null || !propertyMessages.containsKey(descriptionMessage))
		{
			Element docNode = propertyDocNodes.get(prop.getName());
			if (docNode != null)
			{
				String docText = docNode.getTextContent();
				sentenceBreaks.setText(docText);
				int first = sentenceBreaks.first();
				int next = sentenceBreaks.next();

				// next() returns DONE (-1) when there is no boundary after the start,
				// e.g. for empty text; substring(first, -1) would throw in that case.
				String firstSentence = next == BreakIterator.DONE
						? docText
						: docText.substring(first, next);
				firstSentence = PATTERN_LEADING_WHITE_SPACE.matcher(firstSentence).replaceAll("");
				firstSentence = PATTERN_TRAILING_WHITE_SPACE.matcher(firstSentence).replaceAll("");

				defaultMessages.setProperty(descriptionMessage, firstSentence);
			}
		}
	}

	if (!defaultMessages.isEmpty())
	{
		try
		{
			FileObject res = environment.getFiler().createResource(StandardLocation.CLASS_OUTPUT, 
					"", properties.getMessagesName() + PropertyMetadataConstants.MESSAGES_DEFAULTS_SUFFIX, 
					(javax.lang.model.element.Element[]) null);
			try (OutputStream out = res.openOutputStream())
			{
				//TODO lucianc preserve order
				defaultMessages.store(out, null);
			}
		}
		catch (IOException e)
		{
			throw new RuntimeException(e);
		}
	}
}

Source File: BreakIteratorTest.java From TencentKona-8 with GNU General Public License v2.0

4 votes

/**
 * Runs the shared invariant check on a default sentence iterator, using
 * the canned test characters extended with ASCII and CJK punctuation,
 * hiragana letters and U+FEFF.
 */
public void TestSentenceInvariants()
{
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance();
    String sampleChars = cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff";
    doOtherInvariantTest(sentenceIterator, sampleChars);
}

Source File: TestSegmentingTokenizerBase.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Creates the tokenizer, handing the superclass a new attribute factory
 * and a ROOT-locale sentence {@link BreakIterator}.
 */
public SentenceAndWordTokenizer() {
  super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}

Source File: BreakIteratorTest.java From dragonwell8_jdk with GNU General Public License v2.0

4 votes

/**
 * Runs the shared invariant check on a default sentence iterator, using
 * the canned test characters extended with ASCII and CJK punctuation,
 * hiragana letters and U+FEFF.
 */
public void TestSentenceInvariants()
{
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance();
    String sampleChars = cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff";
    doOtherInvariantTest(sentenceIterator, sampleChars);
}

Java Code Examples for java.text.BreakIterator#getSentenceInstance()