java source code of TesseractOCRParser

CogStack-Pipeline-master
- src
  - acceptance-test
    - resources
      - elasticgazetteer_test.properties
    - java
      - uk
        ac
        kcl
        at
        ElasticGazetteerAcceptanceTest.java
    - .gradle
      - buildOutputCleanup
        built.bin
        cache.properties
        cache.properties.lock
  - main
    - resources
      - logback.xml
      - spring-slave.xml
      - PostgresSessionProperties.sql
      - spring-master.xml
      - datePatterns.txt
      - tika-config.xml
      - SqlServerSessionProperties.sql
    - main.iml
    - java
      - uk
        ac
        kcl
        database
        MapItemSqlParameterSourceProvider.java
        utils
        MatchingWindow.java
        LoggerHelper.java
        StringTools.java
        TcpHelper.java
        BatchJobUtils.java
        partitioners
        TimestampAndPKStepPartitioner.java
        StepPartitioner.java
        CogstackJobPartitioner.java
        rowmappers
        SimpleDocumentRowMapper.java
        DocumentRowMapper.java
        ColumnKeyRowMapper.java
        PartitionParamsRowMapper.java
        listeners
        NonFatalExceptionItemProcessorListener.java
        JobCompleteNotificationListener.java
        SkipListener.java
        service
        GateService.java
        ESRestService.java
        ElasticGazetteerService.java
        Main.java
        exception
        CogstackException.java
        GateProcessingFailedException.java
        DeIdentificationFailedException.java
        WebserviceProcessingFailedException.java
        scheduling
        SingleJobLauncher.java
        ScheduledJobLauncher.java
        cleanup
        CleanupBean.java
        itemWriters
        ElasticsearchDocumentWriter.java
        ElasticsearchRestDocumentWriter.java
        JSONFileItemWriter.java
        itemProcessors
        JSONMakerItemProcessor.java
        ThumbnailGenerationItemProcessor.java
        WebserviceDocumentItemProcessor.java
        DeIdDocumentItemProcessor.java
        TLItemProcessor.java
        GateDocumentItemProcessor.java
        PdfBoxItemProcessor.java
        MetadataItemProcessor.java
        TikaDocumentItemProcessor.java
        PDFGenerationItemProcessor.java
        DbLineFixerItemProcessor.java
        tika
        parsers
        TesseractOCRParser.java
        PDFPreprocessorParser.java
        config
        ImageMagickConfig.java
        jobParametersIncrementers
        TLJobParametersIncrementer.java
        batch
        JobConfiguration.java
        LocalConfiguration.java
        BatchConfigurer.java
        RemoteConfiguration.java
    - groovy
      - uk
        ac
        kcl
        model
        MultilineDocument.groovy
        PartitionParams.groovy
        ScheduledPartitionParams.groovy
        Document.groovy
    - .gradle
      - buildOutputCleanup
        built.bin
        cache.properties
        cache.properties.lock
  - test
    - resources
      - logback-test.xml
      - gate_breaker.txt
      - pdfbox
        testdocs
      - xhtml_test
      - tika
        testdocs
        docxexample.docx
        pat_id_2.docx
        encryptedWordDocx.docx
        log4j.properties
        pat_id_1.docx
        tika-config.xml
    - java
      - uk
        ac
        kcl
        partitioners
        ColumnValuePartitionerTest.java
        itemProcessors
        MetadataItemProcessorTest.java
        tika
        parsers
        PDFPreprocessorParserTest.java
    - .gradle
      - buildOutputCleanup
        built.bin
        cache.properties
        cache.properties.lock
  - integration-test
    - resources
      - gate.properties
      - logback-test.xml
      - tika_db.properties
      - noScheduling.properties
      - hsql_db.properties
      - sql_server_db_jdbc_map.properties
      - tika_filesystem.properties
      - elasticsearch.properties
      - biolark_webservice.properties
      - sql_server_db.properties
      - pdfGenerationItemProcessor.properties
      - postgres_db.properties
      - scheduling.properties
      - identifiers_small.csv
      - sql_server_test.properties
      - postgres_db_jdbc_map.properties
      - pdfBoxItemProcessor.properties
      - jobAndStep.properties
      - simple-compose-yaml
        docker-compose.yml
      - jms.properties
      - thumbnailGenerationItemProcessor.properties
      - configured_start.properties
      - jsonExamples.txt
      - magic-square.csv
      - identifiers.csv
      - metadata.properties
      - bioyodie_webservice.properties
      - deidentification.properties
      - jsonFileItemWriter.properties
      - postgres_test.properties
      - reindex.properties
    - java
      - uk
        ac
        kcl
        utils
        PostGresTestUtils.java
        DbmsTestUtils.java
        RandomString.java
        TestUtils.java
        SqlServerTestUtils.java
        MagicSquare.java
        it
        sqlserver
        ReindexWithoutScheduling.java
        TikaWithoutScheduling.java
        FullPipelineWithoutScheduling.java
        JdbcMapWithoutScheduling.java
        LineFixerWithoutScheduling.java
        DeIdentificationWithoutScheduling.java
        BasicConfigWithSchedulingLargeInsert.java
        GATEWithoutScheduling.java
        BioyodieWebserviceWithoutScheduling.java
        BasicConfigWithoutScheduling.java
        PdfboxWithoutScheduling.java
        DocmanReaderWithoutScheduling.java
        BasicConfigWithSchedulingAndConfiguredStart.java
        BiolarkWebserviceWithoutScheduling.java
        BasicConfigWithSchedulingSmallInsert.java
        postgres
        ReindexWithoutScheduling.java
        TikaWithoutScheduling.java
        FullPipelineWithoutScheduling.java
        JdbcMapWithoutScheduling.java
        LineFixerWithoutScheduling.java
        DeIdentificationWithoutScheduling.java
        BasicConfigWithSchedulingLargeInsert.java
        GATEWithoutScheduling.java
        BioyodieWebserviceWithoutScheduling.java
        BasicConfigWithoutScheduling.java
        PdfboxWithoutScheduling.java
        DocmanReaderWithoutScheduling.java
        BasicConfigWithSchedulingAndConfiguredStart.java
        BiolarkWebserviceWithoutScheduling.java
        BasicConfigWithSchedulingSmallInsert.java
        mutators
        AddressAliasMutator.java
        StringTokenTruncatorMutator.java
        SubstituteCharactersMutator.java
        NullMutator.java
        Mutator.java
        StringMutatorService.java
        Mutant.java
        BadOCRMutator.java
        testexecutionlisteners
        PostgresGateTestExecutionListener.java
        DeidTestExecutionListener.java
        BasicTestExecutionListenerLargeInsert.java
        BasicTestExecutionListenerSmallInsert.java
        PdfboxTestExecutionListener.java
        GateTestExecutionListener.java
        FullPipelineTestExecutionListener.java
        ReindexTestExecutionListener.java
        JdbcMapTestExecutionListener.java
        DocmanReaderTestExecutionListener.java
        DbLineFixerTestExecutionListener.java
        TikaTestExecutionListener.java
        testservices
        LineFixerWithoutSchedulingTests.java
        PdfboxWithoutSchedulingTests.java
        JdbcMapWithoutSchedulingTests.java
        TikaWithoutSchedulingTests.java
        DeIdentificationWithoutSchedulingTests.java
        ReindexWithoutSchedulingTests.java
        GATEWithoutSchedulingTests.java
        BiolarkWebserviceWithoutSchedulingTests.java
        BasicConfigWithSchedulingSmallInsertTests.java
        BasicConfigWithSchedulingLargeInsertTests.java
        FullPipelineWithoutSchedulingTests.java
        BasicConfigWithSchedulingAndConfiguredStartTests.java
        BioyodieWebserviceWithoutSchedulingTests.java
        HsqlTestUtils.java
        DocmanReaderWithoutSchedulingTests.java
        BasicConfigWithoutSchedulingTests.java
    - integration-test.iml
    - .gradle
      - buildOutputCleanup
        built.bin
        cache.properties
        cache.properties.lock
- HISTORY
- gradle
  - integrationtest.gradle
  - runcontainer.gradle
  - wrapper
    - gradle-wrapper.properties
    - gradle-wrapper.jar
  - acceptancetest.gradle
  - test.gradle
  - profile.writedefaultconfigs.gradle
  - profile.dev.gradle
  - buildcontainer.gradle
  - profile.run.gradle
- examples
  - example8
    - gate
      - app
        lists.def
        drug.gapp
        active.lst
    - extra
      - clean_list.py
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - setup.sh
  - prepare_docs.sh
  - docker-common
    - nginx
      - config
        nginx.conf
    - fluentd
      - conf
        fluent.conf
    - elasticsearch
      - config
        elasticsearch.yml
    - pgsamples
      - init_db.sh
    - kibana
      - config
        kibana.yml
    - pgjobrepo
      - create_repo.sh
    - docker-compose.yml
  - example2
    - extra
      - prepare_db.sh
      - db_create_schema.sql
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - setup.sh
  - example9
    - gate
      - app
        lists.def
        drug.gapp
        active.lst
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - setup.sh
  - example4
    - extra
      - prepare_single_db.sh
      - prepare_db.sh
      - db_create_schema.sql
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - .gitignore
    - setup.sh
  - example6
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
      - docker-compose.yml
    - setup.sh
  - example7
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - setup.sh
  - example5
    - extra
      - prepare_single_db.sh
      - prepare_db.sh
      - db_create_schema.sql
    - cogstack
      - run_pipeline.sh
      - conf
        step-1
        reports.properties
        step-2
        observations.properties
    - docker
      - docker-compose.override.yml
      - docker-compose.yml
    - .gitignore
    - setup.sh
  - example1
    - extra
      - prepare_db.sh
      - db_create_schema.sql
    - cogstack
      - observations.properties
    - docker
      - docker-compose.override.yml
    - setup.sh
  - rawdata
  - example3
    - extra
      - db_create_syn_schema.sql
      - prepare_mtsamples_db.sh
      - prepare_synsamples_db.sh
      - db_create_mt_schema.sql
    - cogstack
      - template.properties
      - gen_config.sh
      - mt.properties
      - .gitignore
    - docker
      - docker-compose.override.yml
    - setup.sh
  - .gitignore
  - download_db_dumps.sh
  - prepare_db_dumps.sh
- gradlew.bat
- extras
  - ImageMagick
    - policy.xml
  - dockerfiles
    - biolark
      - application-prod.yml
      - start.sh
      - Dockerfile
    - compose-ymls
      - simple
        docker-compose.yml
      - nlp
        docker-compose.yml
      - cogstack-clust
        nginx
        config
        nginx.conf
        auth
        README.md
        elasticsearch
        config
        elasticsearch.yml
        kibana
        config
        kibana.yml
        cogstack
        example.properties
        docker-compose.yml
    - java
      - Dockerfile.devel
      - Dockerfile.run
    - libre-office
      - Dockerfile
    - bioyodie
      - start.sh
      - Dockerfile
  - fig
    - KCL_boxed_redcmyk_A4-002-3.gif
    - cti-banner.jpg
  - exampleConfigs
    - gateJob.properties
    - basicJob2.properties
    - dBLineFixerJob.properties
    - bioyodieJob.properties
    - biolarkJob.properties
    - example.properties
    - deidJob.properties
    - reindexJob.properties
    - tikaJob.properties
    - basicJob.properties
    - fullPipelineJob.properties
  - localBuild
    - Dockerfile.devel
    - README.txt
    - build-containers.sh
    - Dockerfile.run
- LICENSE
- test
  - examples
    - run.sh
    - run_tests.py
    - examples_common.py
    - examples_tests.py
    - requirements.txt
    - connectors.py
  - dockerfiles
    - sqlserver
      - entrypoint.sh
      - docker-compose.yaml
      - create_schema.sql
      - Dockerfile
      - create-repo.sh
    - postgres
      - Dockerfile
      - create-repo.sh
- gradlew
- dockerfiles
  - fluentd
    - Dockerfile
    - conf
      - fluent.conf
  - gate
    - Dockerfile
- build.gradle
- CHANGELOG.txt
- .travis.yml
- README.md
- scripts
  - run_pipeline.sh
  - wait_for_service.sh
- Dockerfile
- settings.gradle
- .gitignore
- travis_gradle_build.sh
- docs
  - examples
    - _config.yml
    - Gemfile
    - index.md
    - 404.html
    - .gitignore
    - assets
      - logo-kibana.svg
  - quickstart
    - _config.yml
    - Gemfile
    - index.md
    - 404.html
    - _out_static
      - mirror.sh
      - index.html
      - .gitignore
    - .gitignore
    - assets
- .dockerignore

/* 
 * Copyright 2016 King's College London, Richard Jackson <[email protected]>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package uk.ac.kcl.tika.parsers;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.*;
import java.util.List;
import java.util.concurrent.*;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Tika parser that extracts text from images using the external
 * tesseract-ocr engine. To configure this parser, create a
 * {@link TesseractOCRConfig} object and pass it through the ParseContext;
 * when none is supplied, a default configuration is used. Tesseract-ocr must
 * be installed and either be on the system path, or the path to its root
 * folder must be provided:
 * <p>
 * TesseractOCRConfig config = new TesseractOCRConfig();<br>
 * //Needed if tesseract is not on system path<br>
 * config.setTesseractPath(tesseractFolder);<br>
 * parseContext.set(TesseractOCRConfig.class, config);<br>
 * </p>
 */
public class TesseractOCRParser extends AbstractParser {
    private static final long serialVersionUID = -8167538283213097265L;
    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(new MediaType[] {
                    MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
                    MediaType.image("x-ms-bmp"), MediaType.image("gif")
            })));
    private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();

    /**
     * Reports which media types this parser handles for the given context.
     *
     * @param context parse context; its TesseractOCRConfig is used if present,
     *                otherwise the default configuration
     * @return the supported image types when Tesseract is available, or an
     *         empty set so that other image parsers can be selected instead
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

        if (!hasTesseract(ocrConfig)) {
            // Tesseract is missing: advertise nothing
            return Collections.emptySet();
        }
        return SUPPORTED_TYPES;
    }

    /**
     * Sets the TESSDATA_PREFIX environment variable of the process to be started.
     * The configured tessdata path wins; if it is empty, the tesseract path is
     * used; if both are empty, the variable is left untouched.
     *
     * @param config OCR configuration supplying the paths
     * @param pb     builder of the tesseract process whose environment is modified
     */
    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
        final String variable = "TESSDATA_PREFIX";
        Map<String, String> environment = pb.environment();

        if (config.getTessdataPath().isEmpty()) {
            if (!config.getTesseractPath().isEmpty()) {
                environment.put(variable, config.getTesseractPath());
            }
            return;
        }
        environment.put(variable, config.getTessdataPath());
    }

    /**
     * Tells whether a usable Tesseract executable exists for the given
     * configuration. The result of each probe is remembered per command string
     * in TESSERACT_PRESENT, so every distinct location is checked only once.
     *
     * @param config OCR configuration supplying the tesseract path
     * @return true if ExternalParser.check accepted the tesseract command
     */
    private boolean hasTesseract(TesseractOCRConfig config) {
        // Command = where the config says Tesseract lives + platform program name
        String command = config.getTesseractPath() + getTesseractProg();

        if (!TESSERACT_PRESENT.containsKey(command)) {
            // Not probed yet: try running it and remember the outcome
            boolean available = ExternalParser.check(new String[] { command });
            TESSERACT_PRESENT.put(command, available);
            return available;
        }

        return TESSERACT_PRESENT.get(command);
    }

    /**
     * Runs OCR on an in-memory image. The image is rendered into an RGB
     * {@link BufferedImage}, written to a temporary PNG file and then handed to
     * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
     *
     * @param image    image to OCR; it should be fully loaded, because it is
     *                 drawn without an ImageObserver
     * @param handler  receiver of the SAX events
     * @param metadata document metadata
     * @param context  parse context (may carry a TesseractOCRConfig)
     * @throws IOException   if the temporary file cannot be written or read
     * @throws SAXException  if the SAX events could not be handled
     * @throws TikaException if the image dimensions are not available, or
     *                       parsing / temporary resource cleanup fails
     */
    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {

        int w = image.getWidth(null);
        int h = image.getHeight(null);
        if (w <= 0 || h <= 0) {
            // getWidth/getHeight return -1 while the size is not yet known
            throw new TikaException("Cannot OCR image with unknown or empty dimensions: " + w + "x" + h);
        }

        TemporaryResources tmp = new TemporaryResources();
        try {
            BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

            // Copy the pixels of the source image; without this the PNG would be blank
            Graphics2D graphics = bImage.createGraphics();
            try {
                graphics.drawImage(image, 0, 0, null);
            } finally {
                graphics.dispose();
            }

            File file = tmp.createTemporaryFile();
            // Close the writer before the file is read back
            try (FileOutputStream fos = new FileOutputStream(file)) {
                ImageIO.write(bImage, "png", fos);
            }
            try (TikaInputStream tis = TikaInputStream.get(file)) {
                parse(tis, handler, metadata, context);
            }
        } finally {
            // Streams are already closed here, so the temporary file can be removed
            tmp.dispose();
        }

    }

    /**
     * Runs Tesseract on the given image stream and writes the recognised text
     * to the handler, then forwards the image to the regular image parsers so
     * that their metadata is extracted as well.
     *
     * @param stream   image data
     * @param handler  receiver of the SAX events
     * @param metadata document metadata
     * @param context  parse context; its TesseractOCRConfig is used if present,
     *                 otherwise the default configuration
     * @throws IOException   if an input error occurred
     * @throws SAXException  if the SAX events could not be handled
     * @throws TikaException if OCR timed out / was interrupted or cleanup failed
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

        // If Tesseract is not on the path with the current config, do not try to run OCR
        // getSupportedTypes shouldn't have listed us as handling it, so this should only
        //  occur if someone directly calls this parser, not via DefaultParser or similar
        if (! hasTesseract(config))
            return;

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

        TemporaryResources tmp = new TemporaryResources();
        File ocrText = null;
        try {
            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
            File input = tikaStream.getFile();
            long size = tikaStream.getLength();

            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {

                File outputBase = tmp.createTemporaryFile();

                // Tesseract appends .txt to output file name. Remember that file
                // before running OCR so it is removed even if doOCR fails.
                ocrText = new File(outputBase.getAbsolutePath() + ".txt");
                doOCR(input, outputBase, config);

                if (ocrText.exists()) {
                    // Guarantees the stream is closed even if the handler fails early
                    try (InputStream ocrStream = new FileInputStream(ocrText)) {
                        extractOutput(ocrStream, xhtml);
                    }
                }

            }

            // Temporary workaround for TIKA-1445 - until we can specify
            //  composite parsers with strategies (eg Composite, Try In Turn),
            //  always send the image onwards to the regular parser to have
            //  the metadata for them extracted as well
            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
        } finally {
            try {
                tmp.dispose();
            } finally {
                // The .txt file is not tracked by tmp, delete it explicitly
                if (ocrText != null) {
                    ocrText.delete();
                }
            }
        }
    }
    // TIKA-1445 workaround parser
    private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();

    /**
     * Composite parser that delegates to the plain image, JPEG and TIFF
     * parsers, using a fresh MediaTypeRegistry.
     */
    private static class CompositeImageParser extends CompositeParser {
        private static final long serialVersionUID = -2398203346206381382L;

        /** Delegates handed to the CompositeParser constructor. */
        private static List<Parser> imageParsers = Arrays.<Parser>asList(
                new ImageParser(),
                new JpegParser(),
                new TiffParser());

        CompositeImageParser() {
            super(new MediaTypeRegistry(), imageParsers);
        }
    }

    /**
     * Run external tesseract-ocr process and wait for it to finish, for at
     * most the timeout (in seconds) given by the configuration.
     *
     * @param input
     *          File to be ocred
     * @param output
     *          File to collect ocr result
     * @param config
     *          Configuration of tesseract-ocr engine
     * @throws TikaException
     *           if the extraction timed out, was interrupted, or waiting for
     *           the process failed
     * @throws IOException
     *           if an input error occurred
     */
    private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
        // NOTE(review): newer tesseract releases spell this option "--psm" - confirm against the installed version
        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
                config.getLanguage(), "-psm", config.getPageSegMode() };

        ProcessBuilder pb = new ProcessBuilder(cmd);
        setEnv(config, pb);
        final Process process = pb.start();

        // Nothing is sent to tesseract on stdin
        process.getOutputStream().close();
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();

        // Drain both pipes on background threads so the process cannot block on them
        logStream("OCR MSG", out, input);
        logStream("OCR ERROR", err, input);

        // Wait on a separate thread so that the wait can be bounded by a timeout
        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
            public Integer call() throws Exception {
                return process.waitFor();
            }
        });

        Thread waitThread = new Thread(waitTask);
        waitThread.start();

        try {
            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);

        } catch (InterruptedException e) {
            waitThread.interrupt();
            process.destroy();
            Thread.currentThread().interrupt();
            throw new TikaException("TesseractOCRParser interrupted", e);

        } catch (ExecutionException e) {
            // Not expected, but if waiting for the process failed the OCR result
            // cannot be trusted: report it instead of silently carrying on
            process.destroy();
            throw new TikaException("TesseractOCRParser failed while waiting for tesseract", e);

        } catch (TimeoutException e) {
            waitThread.interrupt();
            process.destroy();
            throw new TikaException("TesseractOCRParser timeout", e);
        }
    }

    /**
     * Copies the OCR text from the given stream, decoded as UTF-8, into the
     * XHTML content handler as the character content of a single
     * <code>div</code> inside its own document. The stream is closed once it
     * has been read completely.
     *
     * @param stream
     *          Stream holding the result of ocr
     * @param xhtml
     *          XHTML content handler
     * @throws SAXException
     *           if the XHTML SAX events could not be handled
     * @throws IOException
     *           if an input error occurred
     */
    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {

        xhtml.startDocument();
        xhtml.startElement("div");

        try (Reader ocrReader = new InputStreamReader(stream, UTF_8)) {
            char[] chunk = new char[1024];
            int count;
            while ((count = ocrReader.read(chunk)) != -1) {
                // a zero-length read carries no text, skip it
                if (count > 0) {
                    xhtml.characters(chunk, 0, count);
                }
            }
        }

        xhtml.endElement("div");
        xhtml.endDocument();
    }

    /**
     * Starts a thread that reads the contents of the standard output or error
     * stream of the given process to not block the process. The collected text
     * is written to the debug log, prefixed with the log type and the file
     * being processed. The stream is closed once fully processed.
     *
     * @param logType label identifying which stream is logged
     * @param stream  process stream to drain; decoded as UTF-8
     * @param file    file being OCRed, used only to label the log entry
     */
    private void logStream(final String logType, final InputStream stream, final File file) {
        new Thread() {
            @Override
            public void run() {
                final String prefix = logType + " [" + file + "]: ";
                Reader reader = new InputStreamReader(stream, UTF_8);
                StringBuilder out = new StringBuilder();
                char[] buffer = new char[1024];
                try {
                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
                        out.append(buffer, 0, n);
                } catch (IOException e) {
                    // Best effort only: keep whatever was read, but leave a trace of the failure
                    LogFactory.getLog(TesseractOCRParser.class).debug(prefix + "failed to read process output", e);
                } finally {
                    IOUtils.closeQuietly(stream);
                }

                String msg = prefix + out.toString();
                LogFactory.getLog(TesseractOCRParser.class).debug(msg);
            }
        }.start();
    }

    /**
     * Returns the file name of the tesseract executable for the current
     * platform, decided from the "os.name" system property.
     *
     * @return "tesseract.exe" when the OS name starts with "Windows",
     *         "tesseract" otherwise
     */
    static String getTesseractProg() {
        String osName = System.getProperty("os.name");
        if (osName.startsWith("Windows")) {
            return "tesseract.exe";
        }
        return "tesseract";
    }

}