python source code of text

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from six import text_type
import jaconv
import six
import re
import unicodedata
from JapaneseTokenizer import init_logger
import logging
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'

if six.PY2:
    def u(str): return str.decode("utf-8")
    def b(str): return str
    pass
else: # python3
    def u(str): return str
    def b(str): return str.encode("utf-8")
    pass

try:
    import neologdn
    is_neologdn_valid = True
except:
    logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.")
    is_neologdn_valid = False

STRING_EXCEPTION = set([u('*')])


def denormalize_text(input_text):
    # type: (text_type)->text_type
    """* What you can do
    - It converts text into standard japanese writing way

    * Note
    - hankaku-katakana is to zenkaku-katakana
    - zenkaku-eisu is to hankaku-eisu
    """
    if input_text in STRING_EXCEPTION:
        return input_text
    else:
        return jaconv.z2h(input_text, kana=False, ascii=True, digit=True)


def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.

    * Params
    - new_line_replaced: a string which replaces from \n string.
    """
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        without_new_line = new_line_replaced

    if dictionary_mode=='neologd' and is_neologdn_valid:
        return neologdn.normalize(normalize_text_normal_ipadic(without_new_line))
    elif dictionary_mode=='neologd' and is_neologdn_valid == False:
        raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
    else:
        return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit)


def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
    # type: (text_type,bool,bool,bool)->text_type
    """
    * All hankaku Katanaka is converted into Zenkaku Katakana
    * All hankaku English alphabet and numberc string are converted into Zenkaku one
    """
    return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)