Java Code Examples for java.text.BreakIterator#getSentenceInstance()

The following examples show how to use java.text.BreakIterator#getSentenceInstance(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SimpleGroovyDoc.java    From groovy with Apache License 2.0 6 votes vote down vote up
/**
 * Derives the summary (first sentence) of a raw doc comment.
 *
 * <p>The leading {@code *} decoration is stripped from every line, the text is
 * cut at the first {@code <p>} tag, blank line or recognised {@code @tag} line,
 * and the first sentence of what remains is returned. Sentence detection uses a
 * {@link BreakIterator} for the default locale.
 *
 * @param raw the raw comment text; must not be {@code null}
 * @return the first sentence, or the whole cleaned-up text when the break
 *         iterator reports no sentence boundary
 */
public static String calculateFirstSentence(String raw) {
    // drop the "*" (and the whitespace before it) that starts each comment line
    String summary = raw.replaceAll("(?m)^\\s*\\*", "").trim();

    // Each pattern marks where the summary ends; everything from the marker
    // onwards is discarded. Applied in order: <p> tag, blank line, @tag line.
    final String[] terminators = {
        "(?ms)<p>.*",
        "(?ms)\\n\\s*\\n.*",
        "(?ms)\\n\\s*@(see|param|throws|return|author|since|exception|version|deprecated|todo)\\s.*",
    };
    for (String terminator : terminators) {
        summary = summary.replaceFirst(terminator, "").trim();
    }

    // locale-sensitive detection of the first sentence
    final BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.getDefault()); // todo - allow locale to be passed in
    sentences.setText(summary);
    final int from = sentences.first();
    final int to = sentences.next();
    if (from <= -1 || to <= -1) {
        // no usable boundary pair: keep the text as it is
        return summary;
    }
    // abbreviate the comment to its first sentence
    return summary.substring(from, to);
}
 
Example 2
Source File: DocCommentParser.java    From hottub with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Creates a parser for one doc comment.
 *
 * <p>Stores the factory, diagnostic source and comment, copies {@code names}
 * and {@code docTreeMaker} from the factory, optionally creates the sentence
 * {@link BreakIterator}, and finally calls {@code initTagParsers()}.
 *
 * @param fac the factory supplying names, tree maker, locale and options
 * @param diagSource the diagnostic source stored on this parser
 * @param comment the comment stored on this parser
 */
DocCommentParser(ParserFactory fac, DiagnosticSource diagSource, Comment comment) {
    this.fac = fac;
    this.diagSource = diagSource;
    this.comment = comment;
    names = fac.names;
    m = fac.docTreeMaker;

    // fall back to the JVM default when the factory carries no locale
    final Locale effectiveLocale = (fac.locale != null) ? fac.locale : Locale.getDefault();
    final boolean forceBreakIterator = fac.options.isSet("breakIterator");

    // A sentence breaker is created only when explicitly requested through
    // the "breakIterator" option, or when the language is not English;
    // otherwise sentenceBreaker is not assigned here.
    if (forceBreakIterator
            || !effectiveLocale.getLanguage().equals(Locale.ENGLISH.getLanguage())) {
        sentenceBreaker = BreakIterator.getSentenceInstance(effectiveLocale);
    }

    initTagParsers();
}
 
Example 3
Source File: ImportExportServiceImpl.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Splits the document text of the given CAS into sentences and adds one
 * sentence annotation per non-empty span to the CAS indexes.
 *
 * <p>Boundaries come from a {@link BreakIterator} sentence instance for
 * {@link Locale#US}. Each candidate span is passed through {@code trim}
 * (presumably narrowing the span in place to exclude surrounding whitespace;
 * confirm against the helper) and skipped when {@code isEmpty} reports it empty.
 *
 * @param aCas the CAS whose document text is segmented
 */
public static void splitSentences(CAS aCas)
{
    BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.US);
    sentenceIterator.setText(aCas.getDocumentText());

    // walk consecutive boundary pairs [begin, end) until the iterator is exhausted
    for (int begin = sentenceIterator.first(), end = sentenceIterator.next();
            end != BreakIterator.DONE;
            begin = end, end = sentenceIterator.next()) {
        int[] span = { begin, end };
        trim(aCas.getDocumentText(), span);
        if (isEmpty(span[0], span[1])) {
            continue;
        }
        aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
    }
}
 
Example 4
Source File: AccessibleHTML.java    From Bytecoder with Apache License 2.0 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 5
Source File: DocLocale.java    From openjdk-jdk8u with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Constructor.
 *
 * <p>Stores the arguments, resolves the locale through {@code getLocale()}
 * and then creates the collator and the sentence break iterator for it.
 * When a locale is resolved it is also installed as the JVM default; when
 * none is resolved, {@code docenv.exit()} is called.
 *
 * @param docenv the documentation environment
 * @param localeName the name of the requested locale
 * @param useBreakIterator the break iterator flag stored on this instance
 */
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
    this.docenv = docenv;
    this.localeName = localeName;
    this.useBreakIterator = useBreakIterator;

    locale = getLocale();
    if (locale != null) {
        Locale.setDefault(locale); // NOTE: updating global state
    } else {
        docenv.exit();
    }

    collator = Collator.getInstance(locale);
    sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}
 
Example 6
Source File: AccessibleHTML.java    From hottub with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 7
Source File: TestCustomSeparatorBreakIterator.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a {@code CustomSeparatorBreakIterator} built with a random
 * separator reports the same breaks as the {@link Locale#ROOT} sentence
 * iterator on slices taken from the start of a few short strings.
 * NOTE(review): the two int arguments look like the slice bounds passed to
 * {@code assertSameBreaks}; confirm their exact meaning against that helper.
 */
public void testSliceEnd() throws Exception {
  final BreakIterator reference = BreakIterator.getSentenceInstance(Locale.ROOT);
  final BreakIterator candidate = new CustomSeparatorBreakIterator(randomSeparator());

  // same slice arguments, growing prefix before the "000" tail
  for (String text : new String[] {"a000", "ab000", "abc000"}) {
    assertSameBreaks(text, 0, 1, reference, candidate);
  }
  assertSameBreaks("000", 0, 0, reference, candidate);
}
 
Example 8
Source File: LocalizedBundleInfo.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Splits {@code text} into its consecutive sentences.
 *
 * <p>The pieces are the raw spans between sentence boundaries, so any
 * whitespace that follows a sentence stays attached to it and concatenating
 * the result reproduces {@code text}.
 *
 * @param text the text to split; must not be {@code null}
 * @return the sentences in order; empty when no boundary pair is found
 */
private static String[] splitBySentence(String text) {
    // Use Locale.US since the customizer is setting the default (US) locale text only:
    final BreakIterator breaker = BreakIterator.getSentenceInstance(Locale.US);
    breaker.setText(text);

    final List<String> parts = new ArrayList<String>();
    for (int from = breaker.first(), to = breaker.next();
            to != BreakIterator.DONE;
            from = to, to = breaker.next()) {
        parts.add(text.substring(from, to));
    }
    return parts.toArray(new String[0]);
}
 
Example 9
Source File: AccessibleHTML.java    From JDKSourceCode1.8 with MIT License 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 10
Source File: AccessibleHTML.java    From jdk8u_jdk with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 11
Source File: AccessibleHTML.java    From jdk8u-jdk with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 12
Source File: AccessibleHTML.java    From jdk8u-dev-jdk with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Locates the word or sentence that contains <code>index</code>.
 * <p>
 * The text of the paragraph element at <code>index</code> is scanned with a
 * locale-specific word or sentence <code>BreakIterator</code>, selected by
 * <code>part</code>. On success the paragraph segment itself is narrowed and
 * returned: its offset points to the start of the word/sentence in the
 * array, its count is the length of the word/sentence, and its modelOffset
 * points to the location of the word/sentence in the model.
 *
 * @param part <code>AccessibleText.WORD</code> or
 *             <code>AccessibleText.SENTENCE</code>
 * @param index the model index to look up
 * @return the narrowed segment, or null if <code>part</code> is any other
 *         value or no valid word/sentence can be found
 */
private IndexedSegment getSegmentAt(int part, int index)
    throws BadLocationException {

    final IndexedSegment segment = getParagraphElementText(index);
    if (segment == null) {
        return null;
    }

    final BreakIterator breaker;
    if (part == AccessibleText.WORD) {
        breaker = BreakIterator.getWordInstance(getLocale());
    } else if (part == AccessibleText.SENTENCE) {
        breaker = BreakIterator.getSentenceInstance(getLocale());
    } else {
        // only words and sentences are supported here
        return null;
    }

    segment.first();
    breaker.setText(segment);

    // first boundary after the requested position, in array coordinates
    final int end = breaker.following(index - segment.modelOffset + segment.offset);
    if (end == BreakIterator.DONE || end > segment.offset + segment.count) {
        return null;
    }

    // the boundary just before it starts the word/sentence
    final int begin = breaker.previous();
    final boolean beginUsable =
        begin != BreakIterator.DONE && begin < segment.offset + segment.count;
    if (!beginUsable) {
        return null;
    }

    // shrink the paragraph segment to [begin, end)
    segment.modelOffset += begin - segment.offset;
    segment.offset = begin;
    segment.count = end - begin;
    return segment;
}
 
Example 13
Source File: SpellCheckIterator.java    From xds-ide with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Creates a new spell check iterator.
 * <p>
 * The region's text is read from the document (falling back to the empty
 * string if it cannot be read), the supplied break iterator is positioned
 * on its first two boundaries, and every sentence boundary of the text for
 * the given locale is recorded in <code>fSentenceBreaks</code>.
 *
 * @param document the document containing the specified partition
 * @param region the region to spell check
 * @param locale the locale to use for spell checking
 * @param breakIterator the break-iterator
 */
public SpellCheckIterator(IDocument document, IRegion region, Locale locale, BreakIterator breakIterator) {
	fOffset= region.getOffset();
	fWordIterator= breakIterator;
	fDelimiter= TextUtilities.getDefaultLineDelimiter(document);

	String content;
	try {

		content= document.get(region.getOffset(), region.getLength());

	} catch (Exception exception) {
		// best effort: a region that cannot be read is treated as empty
		content= ""; //$NON-NLS-1$
	}
	fContent= content;

	fWordIterator.setText(content);
	fPredecessor= fWordIterator.first();
	fSuccessor= fWordIterator.next();

	final BreakIterator iterator= BreakIterator.getSentenceInstance(locale);
	iterator.setText(content);

	// setText leaves the iterator on its first boundary, so current() is the starting point
	int offset= iterator.current();
	while (offset != BreakIterator.DONE) {

		// Integer.valueOf replaces the deprecated Integer(int) constructor
		fSentenceBreaks.add(Integer.valueOf(offset));
		offset= iterator.next();
	}
}
 
Example 14
Source File: DocLocale.java    From TencentKona-8 with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Constructor.
 *
 * <p>Stores the arguments, resolves the locale through {@code getLocale()}
 * and then creates the collator and the sentence break iterator for it.
 * When a locale is resolved it is also installed as the JVM default; when
 * none is resolved, {@code docenv.exit()} is called.
 *
 * @param docenv the documentation environment
 * @param localeName the name of the requested locale
 * @param useBreakIterator the break iterator flag stored on this instance
 */
DocLocale(DocEnv docenv, String localeName, boolean useBreakIterator) {
    this.docenv = docenv;
    this.localeName = localeName;
    this.useBreakIterator = useBreakIterator;

    locale = getLocale();
    if (locale != null) {
        Locale.setDefault(locale); // NOTE: updating global state
    } else {
        docenv.exit();
    }

    collator = Collator.getInstance(locale);
    sentenceBreaker = BreakIterator.getSentenceInstance(locale);
}
 
Example 15
Source File: BreakIteratorTest.java    From jdk8u_jdk with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates the test, obtaining one break iterator of each kind (sentence,
 * line, word, character) from the no-argument {@link BreakIterator}
 * factory methods.
 */
public BreakIteratorTest()
{
    sentenceBreak = BreakIterator.getSentenceInstance();
    lineBreak = BreakIterator.getLineInstance();
    wordBreak = BreakIterator.getWordInstance();
    characterBreak = BreakIterator.getCharacterInstance();
}
 
Example 16
Source File: UnifiedSolrHighlighter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the break iterator named by {@code type} for the given locale.
 *
 * @param type one of {@code "SENTENCE"}, {@code "LINE"}, {@code "WORD"} or
 *             {@code "CHARACTER"}; {@code null} selects the sentence iterator
 * @param locale the locale handed to the {@link BreakIterator} factory
 * @return a new break iterator of the requested kind
 * @throws IllegalArgumentException if {@code type} is any other value
 */
protected BreakIterator parseBreakIterator(String type, Locale locale) {
  if (type == null) {
    // no type given: sentence breaking is the default
    return BreakIterator.getSentenceInstance(locale);
  }
  switch (type) {
    case "SENTENCE":
      return BreakIterator.getSentenceInstance(locale);
    case "LINE":
      return BreakIterator.getLineInstance(locale);
    case "WORD":
      return BreakIterator.getWordInstance(locale);
    case "CHARACTER":
      return BreakIterator.getCharacterInstance(locale);
    default:
      throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
  }
}
 
Example 17
Source File: PropertiesDocReader.java    From jasperreports with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Writes default description messages for properties that have none.
 * <p>
 * For every property whose description key is missing from
 * <code>propertyMessages</code> and which has a documentation node, the
 * first sentence of the node's text (per a <code>Locale.US</code> sentence
 * break iterator) is taken, stripped using the leading/trailing white space
 * patterns, and stored under the description key. If any message was
 * collected, the messages are stored to a resource created in the class
 * output location.
 *
 * @throws RuntimeException wrapping the <code>IOException</code> if the
 *         resource cannot be created or written
 */
public void writeDefaultMessages()
{
	Properties defaultMessages = new Properties();
	BreakIterator sentenceBreaks = BreakIterator.getSentenceInstance(Locale.US);
	for (CompiledPropertyMetadata prop : properties.getProperties())
	{
		String descriptionMessage = PropertyMetadataConstants.PROPERTY_DESCRIPTION_PREFIX + prop.getName();
		if (propertyMessages == null || !propertyMessages.containsKey(descriptionMessage))
		{
			Element docNode = propertyDocNodes.get(prop.getName());
			if (docNode != null)
			{
				String docText = docNode.getTextContent();
				sentenceBreaks.setText(docText);
				int first = sentenceBreaks.first();
				int next = sentenceBreaks.next();
				
				// next() yields DONE (-1) when the text has no second boundary,
				// e.g. for empty text; substring(first, -1) would throw, so fall
				// back to the whole text in that case.
				String firstSentence = next == BreakIterator.DONE
						? docText
						: docText.substring(first, next);
				firstSentence = PATTERN_LEADING_WHITE_SPACE.matcher(firstSentence).replaceAll("");
				firstSentence = PATTERN_TRAILING_WHITE_SPACE.matcher(firstSentence).replaceAll("");
				
				defaultMessages.setProperty(descriptionMessage, firstSentence);
			}
		}
	}
	
	if (!defaultMessages.isEmpty())
	{
		try
		{
			FileObject res = environment.getFiler().createResource(StandardLocation.CLASS_OUTPUT, 
					"", properties.getMessagesName() + PropertyMetadataConstants.MESSAGES_DEFAULTS_SUFFIX, 
					(javax.lang.model.element.Element[]) null);
			try (OutputStream out = res.openOutputStream())
			{
				//TODO lucianc preserve order
				defaultMessages.store(out, null);
			}
		}
		catch (IOException e)
		{
			throw new RuntimeException(e);
		}
	}
}
 
Example 18
Source File: BreakIteratorTest.java    From TencentKona-8 with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Runs <code>doOtherInvariantTest</code> on a default sentence break
 * iterator, using the canned test characters extended with extra
 * punctuation and other non-ASCII characters.
 */
public void TestSentenceInvariants()
{
    doOtherInvariantTest(
            BreakIterator.getSentenceInstance(),
            cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
}
 
Example 19
Source File: TestSegmentingTokenizerBase.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the tokenizer, passing the superclass a new attribute factory and
 * a sentence {@link BreakIterator} for {@link Locale#ROOT}.
 */
public SentenceAndWordTokenizer() {
  super(newAttributeFactory(), BreakIterator.getSentenceInstance(Locale.ROOT));
}
 
Example 20
Source File: BreakIteratorTest.java    From dragonwell8_jdk with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Runs <code>doOtherInvariantTest</code> on a default sentence break
 * iterator, using the canned test characters extended with extra
 * punctuation and other non-ASCII characters.
 */
public void TestSentenceInvariants()
{
    doOtherInvariantTest(
            BreakIterator.getSentenceInstance(),
            cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
}