java source code of UrlTitleAnnouncer

/**
 * Copyright (C) 2013 Oldterns
 *
 * This file may be modified and distributed under the terms
 * of the MIT license. See the LICENSE file for details.
 */
package com.oldterns.vilebot.handlers.user;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.pircbotx.hooks.ListenerAdapter;
import org.pircbotx.hooks.types.GenericMessageEvent;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Will find the HTML title of HTTP(S) pages from certain domains. Generally it's only worth adding trustworthy domains
 * that don't include titles in the URL for SEO.
 */

public class UrlTitleAnnouncer
    extends ListenerAdapter
{

    private static final Pattern urlPattern =
        Pattern.compile( "((?:http|https)://(?:www.|)(?:(?:abstrusegoose|xkcd)\\.com|youtube\\.(?:com|ca)|youtu\\.be)[^ ]*)" );

    private static final Pattern titlePattern = Pattern.compile( "<title>(.*)</title>" );

    @Override
    public void onGenericMessage( final GenericMessageEvent event )
    {
        Matcher urlMatcher = urlPattern.matcher( event.getMessage() );

        if ( urlMatcher.find() )
        {
            String title = scrapeURLHTMLTitle( urlMatcher.group( 1 ) );
            event.respondWith( "'" + title + "'" );
        }
    }

    /**
     * Accesses the source of a HTML page and looks for a title element
     * 
     * @param url http URI String
     * @return String of text between the first <title> tag group on the page, empty if error.
     */
    private String scrapeURLHTMLTitle( String url )
    {
        String title = "";

        try
        {
            Document doc = Jsoup.connect( url ).get();
            title = doc.title();
        }
        catch ( IOException x )
        {
            System.err.format( "scrapeURLHTMLTitle BufferedReader error: %s%n", x );
        }

        return title;
    }

}