# python source code of gen

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Install dependencies:
# sudo pip install pathlib2 Pillow pygame scikit-image

import os
import re
import codecs
import shutil
import random
import inspect
import glob
import pygame
import pygame.locals
import pygame.image
import pygame.freetype
import pygame.transform
import pygame.draw
import pygame.surfarray
from pygame import Color
import numpy as np
from PIL import Image, ImageOps
try:
    # The goal of pathlib2 is to provide a backport of standard pathlib module.
    # refers to https://pypi.org/project/pathlib2/
    from pathlib2 import Path
except Exception:
    # pathlib is new in version 3.4.
    # refers to https://docs.python.org/3/library/pathlib.html
    from pathlib import Path

# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
# The default encoding of Python 2 is 'ascii', Python 3 is 'utf-8'.
# Python 2 only: switch the process-wide default encoding to UTF-8 so that
# implicit str/unicode conversions of the Chinese text do not fail.
# When the default encoding is already 'utf-8' (Python 3) the branch below is
# skipped, so the Python-2-only reload() builtin is never evaluated.
import sys
if sys.getdefaultencoding() != 'utf-8':
    # reload(sys) makes sys.setdefaultencoding available again on Python 2.
    reload(sys)
    sys.setdefaultencoding('utf-8')

# http://scikit-image.org/docs/dev/auto_examples/xx_applications/plot_morphology.html
import matplotlib.pyplot as plt
from skimage.morphology import erosion, dilation, opening, closing, white_tophat
from skimage.morphology import disk
from skimage import io


# Root output directory; main() deletes and re-creates it on every run.
OUTPUT_DIR = os.path.join(str(Path.home()), "generated")
# Total number of images main() asks generate() to produce.
OUTPUT_NUM = 1000000  # shall be multiple of OUTPUT_BATCH
# Number of images written into each numbered sub-directory of OUTPUT_DIR.
OUTPUT_BATCH = 1000
# Pixel size of every generated image (putTextAndRotate scales to this size).
IMG_WIDTH = 200
IMG_HEIGHT = 32
# NOTE(review): not referenced by any code visible in this file - confirm
# whether it is still needed.
MAX_LEFT_BLANK = 10
# Inclusive range from which putTextAndRotate draws a random font size.
MIN_FONT_SIZE = int(IMG_HEIGHT / 3)
MAX_FONT_SIZE = IMG_HEIGHT - 4
# Maximum absolute text rotation; 0 leaves the font rotation untouched.
MAX_TEXT_ANGLE = 0
# Inclusive range of the requested text length (in characters) per image.
MIN_TEXT_LEN = 2
MAX_TEXT_LEN = 10

# Directory containing this script; set by main().
mainDir = None
# List of font file paths found in <mainDir>/fonts_Chinese; set by main().
fontsChinese = None
# Phrases of the currently loaded corpus file; set by initChineseSource().
chiPhrases = None


def initChineseSource(fp):
    '''
    Load a corpus file and split it into phrases stored in global chiPhrases.

    The file is read as UTF-8 (undecodable bytes are ignored) and split on
    runs of whitespace and ASCII / full-width punctuation. Empty fragments
    are dropped because they can never be sampled as text.

    Parameters:
        fp: path of the corpus text file.
    '''
    global chiPhrases
    # Raw string so that \s reaches the regex engine as written instead of
    # being treated as an (invalid) string escape.
    patt = re.compile(r'[\s,;!?，。；！？]+')
    # Context manager guarantees the file handle is closed after reading.
    with codecs.open(fp, 'r', encoding='UTF-8', errors='ignore') as src:
        txt = src.read()
    print(txt[:1024])
    chiPhrases = [phrase for phrase in patt.split(txt) if phrase]


def putTextAndRotate(txt, fontName):
    '''
    Put text, rotate, and crop. Returns an img of size IMG_WIDTH * IMG_HEIGHT.

    The text is drawn in black on a white scratch surface with a random font
    size in [MIN_FONT_SIZE, MAX_FONT_SIZE] and, when MAX_TEXT_ANGLE is not 0,
    a random rotation. Trailing characters are dropped until the text's
    bounding box is smaller than IMG_WIDTH x IMG_HEIGHT. A region around the
    text with a small random gap (0~3 pixels) on the left/top/bottom is then
    cropped and scaled to IMG_WIDTH x IMG_HEIGHT.

    Parameters:
        txt: the text to render.
        fontName: path of the font file passed to pygame.freetype.Font.

    Returns:
        (surf, txt): the scaled pygame surface and the text that was really
        rendered (possibly shortened), or (None, None) when no non-empty
        prefix of txt fits or no valid left gap exists.
    '''
    SURF_WIDTH = 800
    SURF_HEIGHT = 600
    surf = pygame.Surface((SURF_WIDTH, SURF_HEIGHT))
    surf.fill(Color(255, 255, 255, 0))  # white
    # determine font size
    fontSize = random.randint(MIN_FONT_SIZE, MAX_FONT_SIZE)
    font = pygame.freetype.Font(fontName, fontSize)
    if MAX_TEXT_ANGLE != 0:
        font.rotation = random.randint(-MAX_TEXT_ANGLE, MAX_TEXT_ANGLE)
    print(
        'font: %s, fontSize: %s, rotation: %s, text: %s' % (os.path.basename(fontName), fontSize, font.rotation, txt))

    # Shorten the text until its bounding box fits inside the target image.
    txtRect = font.get_rect(txt)
    while len(txt) > 0 and (txtRect.width >= IMG_WIDTH or txtRect.height >= IMG_HEIGHT):
        print('txtRect %s is not covered by %s*%s' %
              (txtRect, IMG_HEIGHT, IMG_WIDTH))
        txt = txt[:-1]
        txtRect = font.get_rect(txt)
    if len(txt) == 0:
        print('txt become empty')
        return None, None

    # Integer division keeps the coordinates integral on Python 3 as well,
    # so render_to() and subsurface() receive ints.
    txtX = SURF_WIDTH // 2
    txtY = SURF_HEIGHT // 2
    font.render_to(surf, (txtX, txtY), txt)

    # constraints:
    #   0 <= gapX <= 3
    #   cropX + gapX = txtX
    #   gapX >= -txtRect.left
    #   gapX + txtRect.width <= IMG_WIDTH
    #
    #   0 <= gapY1 <= 3
    #   0 <= gapY2 <= 3
    #   gapY1 = txtY - cropY
    #   gapY1 + gapY2 + txtRect.height <= IMG_HEIGHT
    gapXMin = max(0, -txtRect.left)
    gapXMax = min(IMG_WIDTH - txtRect.width, 3)
    if gapXMin > gapXMax:
        # random.randint() raises ValueError on an empty range; report a
        # failure instead so the caller can retry with another text/font.
        print('no valid left gap for txtRect %s' % txtRect)
        return None, None
    gapX = random.randint(gapXMin, gapXMax)
    gapYMax = min(3, (IMG_HEIGHT - txtRect.height) // 2)
    gapY1 = random.randint(0, gapYMax)
    gapY2 = random.randint(0, gapYMax)
    cropX = txtX - gapX
    cropY = txtY - gapY1
    ratioX = IMG_WIDTH*1.0/(txtRect.width + gapX)
    ratioY = IMG_HEIGHT*1.0/(txtRect.height + gapY1 + gapY2)
    assert(ratioX >= 1.0)
    assert(ratioY >= 1.0)
    # Use the smaller ratio so the crop keeps the IMG_WIDTH:IMG_HEIGHT aspect
    # and still contains the text plus its gaps in both directions.
    ratio = min(ratioX, ratioY)
    cropWidth = int(IMG_WIDTH*1.0/ratio)
    cropHeight = int(IMG_HEIGHT*1.0/ratio)
    cropRect = (cropX, cropY, cropWidth, cropHeight)
    print(
        'txtRect: %s, gapX: %s, gapY1: %s, gapY2: %s, cropRect: %s, ratioX: %s, ratioY: %s' % (txtRect, gapX, gapY1, gapY2, cropRect, ratioX, ratioY))
    surf = surf.subsurface(cropRect)
    surf = pygame.transform.scale(surf, (IMG_WIDTH, IMG_HEIGHT))
    return surf, txt


# http://www.xiaoliangbai.com/2016/09/09/more-on-image-noise-generation
# https://www.cnblogs.com/gongxijun/p/6114232.html
def noise_generator(noise_type, image):
    """
    Return a noisy version of a given image based on the required noise type.

    The input array is never modified.

    Input parameters:
        noise_type: string
            'gauss'        Additive Gaussian noise (mean 0, variance 0.01);
                           the result is clipped to [0, 255] and cast to uint8
            'poisson'      Poisson-distribution based noise
            's&p'          Salt and Pepper noise: random samples are set to
                           255 (salt) or 0 (pepper)
            'speckle'      Multiplicative noise using out = image + n*image
                           where n is standard normal noise
            Any other value returns `image` itself, unchanged.

        image: ndarray with three dimensions (row, col, ch)

    Returns:
        ndarray with the same shape as `image`.
    """
    row, col, ch = image.shape
    if noise_type == "gauss":
        mean = 0.0
        var = 0.01
        sigma = var**0.5
        gauss = np.random.normal(mean, sigma, (row, col, ch))
        noisy = image + gauss
        # Clip before the cast so out-of-range values cannot wrap around.
        return np.clip(noisy, 0, 255).astype('uint8')
    elif noise_type == "s&p":
        s_vs_p = 0.5
        amount = 0.004
        # Work on a copy so the caller's array is left untouched.
        out = image.copy()
        if image.size == 0:
            return out
        # Generate Salt noise. randint's upper bound is exclusive, so `dim`
        # (not dim - 1) makes every index of the axis reachable.
        num_salt = int(np.ceil(amount * image.size * s_vs_p))
        # A tuple of index arrays selects individual (row, col, ch) samples;
        # a list would be interpreted as a single array index by recent NumPy.
        coords = tuple(np.random.randint(0, dim, num_salt)
                       for dim in image.shape)
        out[coords] = 255
        # Generate Pepper noise
        num_pepper = int(np.ceil(amount * image.size * (1. - s_vs_p)))
        coords = tuple(np.random.randint(0, dim, num_pepper)
                       for dim in image.shape)
        out[coords] = 0
        return out
    elif noise_type == "poisson":
        # Scale by the next power of two >= number of distinct values.
        vals = len(np.unique(image))
        vals = 2 ** np.ceil(np.log2(vals))
        noisy = np.random.poisson(image * vals) / float(vals)
        return noisy
    elif noise_type == "speckle":
        gauss = np.random.randn(row, col, ch)
        noisy = image + image * gauss
        return noisy
    else:
        return image


def addNoiseAndGray(surf):
    """
    Convert a pygame surface into a noisy, optionally eroded grayscale array.

    Steps: read the RGB pixels of `surf`, add salt-and-pepper noise, convert
    to grayscale through PIL, then apply a morphological erosion whose disk
    radius is randomly 0 or 1.

    Returns:
        The eroded grayscale image as returned by skimage's erosion().
    """
    # https://stackoverflow.com/questions/34673424/how-to-get-numpy-array-of-rgb-colors-from-pygame-surface
    # Swap the first two axes so the array is indexed rows first
    # (shall be IMG_HEIGHT * IMG_WIDTH).
    pixels = pygame.surfarray.array3d(surf).swapaxes(0, 1)
    noisyPixels = noise_generator('s&p', pixels)

    grayImage = ImageOps.grayscale(Image.fromarray(np.uint8(noisyPixels)))
    grayArray = np.asarray(np.uint8(grayImage))

    # Random disk radius (0 or 1) for the erosion footprint.
    footprint = disk(random.randint(0, 1))
    return erosion(grayArray, footprint)


def generateText(leng):
    """
    Pick a random font and a random text of exactly `leng` characters.

    A phrase is drawn from the global chiPhrases until one with at least
    `leng` characters is found. The text is a slice of that phrase whose
    start is either the phrase start or a position where the character, or
    the one before it, has a code point >= 256, so a slice never starts in
    the middle of a run of low code point characters.

    Returns:
        (font, text): a font path from the global fontsChinese and the text.
    """
    font = random.choice(fontsChinese)
    while True:
        phrase = random.choice(chiPhrases)
        lastStart = len(phrase) - leng
        if lastStart < 0:
            # phrase is too short, draw another one
            continue
        starts = [
            pos for pos in range(0, lastStart + 1)
            if pos == 0 or ord(phrase[pos]) >= 256 or ord(phrase[pos - 1]) >= 256
        ]
        if not starts:
            continue
        begin = random.choice(starts)
        return font, phrase[begin:begin + leng]


def generate(total):
    """
    Yield `total` pairs of (image, text).

    For each pair a random text length in [MIN_TEXT_LEN, MAX_TEXT_LEN] is
    chosen, a text and font are picked, the text is rendered, and noise is
    added. Attempts whose rendering fails (no surface returned) are retried
    and do not count towards `total`.

    Requires the globals mainDir and fontsChinese to be initialized.
    """
    assert(mainDir is not None)
    assert(fontsChinese is not None)
    produced = 0
    while produced < total:
        leng = random.randint(MIN_TEXT_LEN, MAX_TEXT_LEN)
        font, text = generateText(leng)
        surf, text = putTextAndRotate(text, font)
        if surf is None:
            # rendering failed, try again with a new text
            continue
        img = addNoiseAndGray(surf)
        yield img, text
        produced += 1


def main():
    """
    Generate OUTPUT_NUM labelled text images under OUTPUT_DIR.

    OUTPUT_DIR is deleted and re-created. Fonts are read from the
    'fonts_Chinese' directory next to this script; corpus files are matched
    by 'newsgroup/corpus-*.txt' relative to the current working directory.
    The images are spread evenly over the corpus files, in sorted order.
    Image i is written to OUTPUT_DIR/<i // OUTPUT_BATCH>/<i>.jpg and a line
    '<batch>/<i>.jpg<TAB><text>' is appended to OUTPUT_DIR/labels.txt.

    Raises:
        RuntimeError: if no font file or no corpus file is found.
    """
    global mainDir, fontsChinese
    pygame.init()
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    os.makedirs(OUTPUT_DIR)

    # http://stackoverflow.com/questions/50499/how-do-i-get-the-path-and-name-of-the-file-that-is-currently-executing
    selfPath = os.path.abspath(inspect.getfile(inspect.currentframe()))
    mainDir = os.path.dirname(selfPath)
    dirFonts = os.path.join(mainDir, 'fonts_Chinese')
    fontsChinese = [
        os.path.join(dirFonts, fn) for fn in os.listdir(dirFonts)
        if os.path.splitext(fn)[1].lower() in ('.ttf', '.otf')
    ]
    if not fontsChinese:
        raise RuntimeError('no .ttf/.otf font found in %s' % dirFonts)

    chiFiles = sorted(glob.glob('newsgroup/corpus-*.txt'))
    if not chiFiles:
        raise RuntimeError(
            'no corpus file matches newsgroup/corpus-*.txt in %s' % os.getcwd())
    # Integer division: a float quotient would make `i % outputPerChiFile`
    # never hit 0 when OUTPUT_NUM is not a multiple of the file count.
    outputPerChiFile = max(1, OUTPUT_NUM // len(chiFiles))
    initChineseSource(chiFiles[0])
    chiIdx = 1

    outDir = None
    # Explicit UTF-8 so the Chinese labels do not depend on the locale; the
    # context manager closes the file even when generation fails.
    labelsPath = os.path.join(OUTPUT_DIR, "labels.txt")
    with codecs.open(labelsPath, 'w', encoding='UTF-8') as labels:
        for i, (im, text) in enumerate(generate(OUTPUT_NUM)):
            batch = i // OUTPUT_BATCH
            if i % OUTPUT_BATCH == 0:
                outDir = os.path.join(OUTPUT_DIR, str(batch))
                os.makedirs(outDir)
            # Move on to the next corpus file; the index guard covers the
            # remainder images when OUTPUT_NUM is not evenly divisible.
            if i != 0 and i % outputPerChiFile == 0 and chiIdx < len(chiFiles):
                initChineseSource(chiFiles[chiIdx])
                chiIdx += 1
            outf = os.path.join(outDir, '%s.jpg' % i)
            io.imsave(outf, im)  # scikit-image
            labels.write('%s/%s.jpg\t%s\n' % (batch, i, text))
            print('done %s.jpg, text: %s' % (i, text))


# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()