#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Module documentation
'''
Tika Python module provides Python API client to Apache Tika Server.

**Example usage**::

    import tika
    from tika import parser
    parsed = parser.from_file('/path/to/file')
    print(parsed["metadata"])
    print(parsed["content"])

Visit https://github.com/chrismattmann/tika-python to learn more about it.

**Detect IANA MIME Type**::

    from tika import detector
    print(detector.from_file('/path/to/file'))

**Detect Language**::

    from tika import language
    print(language.from_file('/path/to/file'))

**Use Tika Translate**::

   from tika import translate
   print(translate.from_file('/path/to/file', 'srcLang', 'destLang'))
   # Use auto Language detection feature
   print(translate.from_file('/path/to/file', 'destLang'))

***Tika-Python Configuration***
You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
for details on writing configuration files. Configuration is set the first time the server is started.
To use a configuration file with a parser, or detector:
    parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
or:
    detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
or:
    detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')

'''
import types

# Command-line help text. It is echoed to stderr by die() and by the -h/--help
# switch in main(), and is used as the message of the TikaException raised by
# main() for an unrecognised option.
USAGE = """
tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>

tika.py parse all test.pdf test2.pdf                   (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
tika.py detect type test.pdf                           (returns mime-type as text/plain)
tika.py language file french.txt                       (returns language e.g., fr as text/plain)
tika.py translate fr:en french.txt                     (translates the file french.txt from french to english)
tika.py config mime-types                              (see what mime-types the Tika Server can handle)

A simple python and command-line client for Tika using the standalone Tika server (JAR file).
All commands return results in JSON format by default (except text in text/plain).

To parse docs, use:
tika.py parse <meta | text | all> <path>

To check the configuration of the Tika server, use:
tika.py config <mime-types | detectors | parsers>

Commands:
  parse  = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
  detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
  language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
  translate src:dest = parse and extract text and then translate the text from source language to destination language
  config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
             can handle, or installed detectors or parsers)

Arguments:
  urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
  
Switches:
  --verbose, -v                  = verbose mode
  --encode, -e           = encode response in UTF-8
  --csv, -c    = report detect output in comma-delimited format
  --server <TikaServerEndpoint>  = use a remote Tika Server at this endpoint, otherwise use local server
  --install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998

Example usage as python client:
-- from tika import runCommand, parse1
-- jsonOutput = runCommand('parse', 'all', filename)
 or
-- jsonOutput = parse1('all', filename)

"""

import sys, os, getopt, time, codecs, re
try:
    unicode_string = unicode 
    binary_string = str
except NameError:
    unicode_string = str
    binary_string = bytes

try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse as urlparse

# Define make_content_disposition_header(fn), which builds the value of the
# Content-Disposition header sent along with uploaded files. When the optional
# rfc6266 package can be imported its build_header() is used; otherwise a plain
# 'attachment; filename=<basename>' string is produced.
try:
    from rfc6266 import build_header
    def make_content_disposition_header(fn):
        # only the base name of fn is passed on; the result of build_header()
        # is decoded as ASCII before being returned
        return build_header(os.path.basename(fn)).decode('ascii')
except ImportError:
    def make_content_disposition_header(fn):
        # NOTE(review): the base name is interpolated unquoted and unescaped.
        # When fn is a bytes object on Python 3 the %s formatting embeds its
        # repr (b'...') in the header -- callers in this file pass
        # path.encode('utf-8'); confirm this is intended.
        return 'attachment; filename=%s' % os.path.basename(fn)

if sys.version_info[0] < 3:
    open = codecs.open

import requests
import socket 
import tempfile
import hashlib
import platform
from subprocess import Popen
from subprocess import STDOUT
from os import walk
import signal
import logging
import io
import ctypes

# Directory and file name of the client log. Both can be overridden through the
# TIKA_LOG_PATH and TIKA_LOG_FILE environment variables; the defaults are the
# system temporary directory and 'tika.log'.
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
log_file = os.path.join(log_path, os.getenv('TIKA_LOG_FILE', 'tika.log'))

logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")
log = logging.getLogger('tika.tika')

# Handlers are attached only when TIKA_LOG_FILE is unset or non-empty: setting
# it to an empty string skips both the file handler and the console handler.
if os.getenv('TIKA_LOG_FILE', 'tika.log'):
    # File logs
    fileHandler = logging.FileHandler(log_file)
    fileHandler.setFormatter(logFormatter)
    log.addHandler(fileHandler)

    # Stdout logs
    # (logging.StreamHandler() with no argument writes to sys.stderr)
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    log.addHandler(consoleHandler)

# Log level
log.setLevel(logging.INFO)

# ---- Module configuration; most values can be overridden by environment variables ----

# True when running on Windows; selects platform-specific start/kill code paths.
Windows = True if platform.system() == "Windows" else False
# Tika release used to build the default server-jar download URL (TIKA_VERSION).
TikaVersion = os.getenv('TIKA_VERSION', '1.24')
# Directory in which tika-server.jar is looked up / stored (TIKA_PATH).
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
# Directory into which remote documents are downloaded before being sent to Tika.
TikaFilesPath = tempfile.gettempdir()
# Directory that receives tika-server.log; same directory as the client log.
TikaServerLogFilePath = log_path
# URL or path of the Tika server jar (TIKA_SERVER_JAR).
TikaServerJar = os.getenv(
    'TIKA_SERVER_JAR',
    "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"+TikaVersion+"/tika-server-"+TikaVersion+".jar")
ServerHost = "localhost"
# Kept as a string because it is concatenated into URLs and command lines.
Port = "9998"
# Default endpoint used by the parse/detect/translate helpers (TIKA_SERVER_ENDPOINT).
ServerEndpoint = os.getenv(
    'TIKA_SERVER_ENDPOINT', 'http://' + ServerHost + ':' + Port)
# Translator class name placed in the /translate service URL (TIKA_TRANSLATOR).
Translator = os.getenv(
    'TIKA_TRANSLATOR',
    "org.apache.tika.language.translate.Lingo24Translator")
# When truthy, callServer() does not check for / start a local server.
# NOTE: os.getenv returns a string, so ANY non-empty TIKA_CLIENT_ONLY value
# (including '0' or 'false') counts as true where this is tested.
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', False)
# Extra class path entries combined with the server jar (TIKA_SERVER_CLASSPATH).
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
# Seconds slept between checks of the server log during startup (TIKA_STARTUP_SLEEP).
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
# Number of times the server log is checked for the startup message (TIKA_STARTUP_MAX_RETRY).
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
# Java executable and extra JVM arguments used to launch the server.
TikaJava = os.getenv("TIKA_JAVA", "java")
TikaJavaArgs = os.getenv("TIKA_JAVA_ARGS", '')

# Command-line switches; main() overwrites these module globals.
Verbose = 0
EncodeUtf8 = 0
csvOutput = 0

# will be used later on to kill the process and free up ram
# (False until startServer() stores the Popen object of the launched server)
TikaServerProcess = False

class TikaException(Exception):
    '''Error raised by this module for invalid commands, options, or arguments.'''

def echo2(*s):
    '''Write the arguments, joined by single spaces and prefixed with ``tika.py: ``, as one line to stderr.'''
    message = unicode_string(' ').join(unicode_string(part) for part in s)
    sys.stderr.write(unicode_string('tika.py: %s\n') % message)
def warn(*s):
    '''Emit a line on stderr through echo2, with ``Warn:`` placed before the given parts.'''
    parts = ('Warn:',) + s
    echo2(*parts)
def die(*s):
    '''
    Report a fatal error on stderr, print the usage text, and terminate.
    :param s: message parts, joined with spaces by ``echo2``
    :raises SystemExit: always, with exit status 1
    '''
    warn('Error:', *s)
    echo2(USAGE)
    # Exit non-zero so shells and calling scripts can detect the failure; a
    # bare sys.exit() reports success (status 0) even though an error occurred.
    sys.exit(1)

def runCommand(cmd, option, urlOrPaths, port, outDir=None,
               serverHost=ServerHost, tikaServerJar=TikaServerJar,
               verbose=Verbose, encode=EncodeUtf8):
    '''
    Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
    :param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
    :param option: sub-option of the command (e.g. ``'all'`` for parse, ``'type'`` for detect)
    :param urlOrPaths: list of URLs or paths to process; required for ``parse`` and ``detect``
    :param port: port of the Tika server (string or integer)
    :param outDir: directory that receives the output files of ``parse``
    :param serverHost: host name of the Tika server
    :param tikaServerJar: URL or path of the Tika server jar
    :param verbose: passed through to the command implementation
    :param encode: accepted for interface compatibility; not used by this function
    :return: response for the command, usually a ``dict``
    :raises TikaException: when paths are missing or ``cmd`` is not a known command
    '''
    # Exact membership test: ``cmd in 'parse'`` was a substring check, which
    # was also true for '' or fragments such as 'par'.
    if cmd in ('parse', 'detect') and not urlOrPaths:
        # log.error rather than log.exception: there is no active exception here
        log.error('No URLs/paths specified.')
        raise TikaException('No URLs/paths specified.')
    # str() so that an integer port does not break the concatenation
    serverEndpoint = 'http://' + serverHost + ':' + str(port)
    if cmd == 'parse':
        return parseAndSave(option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "detect":
        return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "language":
        return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "translate":
        return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "config":
        # getConfig returns (status, response); only the response is handed back
        status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
        return resp
    else:
        log.error('Bad args')
        raise TikaException('Bad args')


def getPaths(urlOrPaths):
    '''
    Expand the given URL(s)/path(s) into a flat list. A directory is walked
    recursively and every file found below it is added; anything else (a file
    path or a URL) is added unchanged.
    :param urlOrPaths: a single url/path string, or an iterable of them
    :return: ``list`` of paths
    '''
    if urlOrPaths is None:
        return []
    # A single path must not be iterated character by character. Byte strings
    # are checked too, because on Python 2 an ordinary ``str`` path is a byte
    # string and would otherwise be split into letters.
    if isinstance(urlOrPaths, (unicode_string, binary_string)):
        urlOrPaths = [urlOrPaths]
    paths = []
    for eachUrlOrPaths in urlOrPaths:
        if os.path.isdir(eachUrlOrPaths):
            for root, _directories, filenames in walk(eachUrlOrPaths):
                paths.extend(os.path.join(root, filename) for filename in filenames)
        else:
            paths.append(eachUrlOrPaths)
    return paths

def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='application/json', metaExtension='_meta.json',
                 services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
    '''
    Parse the objects and write extracted metadata and/or text in JSON format to matching
    filename with an extension of '_meta.json'.
    :param option: key into ``services`` selecting what to extract
    :param urlOrPaths: url/path or list of them; directories are expanded by ``getPaths``
    :param outDir: output directory; when ``None`` each output file is written next to its input
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``parse1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param metaExtension: suffix appended to the input name to form the output name
    :param services: mapping of option name to server resource path
    :return: ``list`` of the output file paths that were written
    '''
    metaPaths = []
    for path in getPaths(urlOrPaths):
        if outDir is None:
            metaPath = path + metaExtension
        else:
            metaPath = os.path.join(outDir, os.path.split(path)[1] + metaExtension)
        # The write used to be nested in the ``else`` branch above, so with
        # outDir=None a path was returned for a file that was never written.
        log.info('Writing %s' % metaPath)
        # Parse before opening the output so a failed parse does not leave an
        # empty or truncated file behind.
        content = parse1(option, path, serverEndpoint, verbose, tikaServerJar,
                         responseMimeType, services)[1]
        with open(metaPath, 'w', encoding='utf-8') as f:
            f.write(content + u"\n")
        metaPaths.append(metaPath)
    return metaPaths


def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
          responseMimeType='application/json',
          services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
    '''
    Parse the objects and return extracted metadata and/or text in JSON format.
    :param option: key into ``services`` selecting what to extract
    :param urlOrPaths: a single url/path string or an iterable of urls/paths/file objects
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``parse1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param rawResponse: when true, response bodies are returned as raw bytes
    :return: ``list`` of ``(status, response)`` tuples, one per input
    '''
    # a lone string is one path, not a sequence of one-character paths
    if isinstance(urlOrPaths, (unicode_string, binary_string)):
        urlOrPaths = [urlOrPaths]
    # rawResponse was previously accepted but silently dropped
    return [parse1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services,
                   rawResponse=rawResponse)
            for path in urlOrPaths]

def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
          responseMimeType='application/json',
          services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None, requestOptions={}):
    '''
    Parse the object and return extracted metadata and/or text in JSON format.
    :param option: key into ``services``; unknown values fall back to ``'all'`` with a warning
    :param urlOrPath: URL, local path, or open file object to parse
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header (forced to ``text/plain`` for ``/tika``)
    :param services: mapping of option name to server resource path
    :param rawResponse: when true, the response body is returned as raw bytes
    :param headers: extra request headers; the given dict is not modified
    :param config_path: optional Tika configuration file passed on to ``callServer``
    :param requestOptions: extra keyword arguments for the HTTP request
    :return: tuple ``(status, response)``
    '''
    # Work on a private copy: the caller's dict used to be updated in place.
    headers = dict(headers) if headers else {}

    path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        if option not in services:
            log.warning('config option must be one of meta, text, or all; using all.')
        service = services.get(option, services['all'])
        if service == '/tika':
            responseMimeType = 'text/plain'
        # Set once, after the final Accept type is known (the headers were
        # previously also set before this point and then overwritten).
        headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
        # A file object supplied by the caller is used (and closed) directly;
        # otherwise the resolved path is opened for binary reading.
        with urlOrPath if _is_file_object(urlOrPath) else open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          headers, verbose, tikaServerJar, config_path=config_path,
                                          rawResponse=rawResponse, requestOptions=requestOptions)
    finally:
        # remove the temporary download even when the server call fails
        if file_type == 'remote':
            os.unlink(path)
    return (status, response)

def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'file' : '/language/stream'}):
    '''
    Run language detection on every path obtained from ``getPaths(urlOrPaths)``.
    :param option: key into ``services``
    :param urlOrPaths: url/path or list of them
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``detectLang1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``detectLang1`` result per path
    '''
    results = []
    for path in getPaths(urlOrPaths):
        outcome = detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
        results.append(outcome)
    return results

def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'file' : '/language/stream'}, requestOptions={}):
    '''
    Detect the language of the provided stream and return its 2 character code as text/plain.
    :param option: key into ``services``
    :param urlOrPath: URL, local path, or file object of the document
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param requestOptions: extra keyword arguments for the HTTP request
    :return: tuple ``(status, response)``
    :raises TikaException: when ``option`` is not a key of ``services``
    '''
    # Validate before fetching anything. The message is built with join():
    # ``bytes(services.keys())`` raises TypeError on Python 3, which hid the
    # intended TikaException.
    if option not in services:
        message = 'Language option must be one of %s ' % ', '.join(sorted(services))
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # context manager so the upload handle is always closed
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                    {'Accept': responseMimeType}, verbose, tikaServerJar, requestOptions=requestOptions)
    finally:
        # same cleanup as parse1: do not leave downloaded copies behind
        if mode == 'remote':
            os.unlink(path)
    return (status, response)

def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'all': '/translate/all'}):
    '''
    Translate every path obtained from ``getPaths(urlOrPaths)``.
    :param option: ``srcLang:destLang`` or just ``destLang``
    :param urlOrPaths: url/path or list of them
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``doTranslate1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``doTranslate1`` result per path
    '''
    translated = []
    for path in getPaths(urlOrPaths):
        outcome = doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
        translated.append(outcome)
    return translated
    
def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='text/plain',
                 services={'all': '/translate/all'}, requestOptions={}):
    '''
    Translate one document from a source language to a destination language.
    :param option: ``srcLang:destLang``, or ``destLang`` alone to omit the source language from the request
    :param urlOrPath: URL, local path, or file object of the document
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping holding the translate resource under key ``'all'``
    :param requestOptions: extra keyword arguments for the HTTP request
    :return: tuple ``(status, response)``
    :raises TikaException: when ``option`` contains more than one ``:``
    '''
    # Validate the option before fetching the document.
    srcLang = ""
    if ":" in option:
        options = option.split(':')
        if len(options) != 2:
            message = 'Translate options are specified as srcLang:destLang or as destLang'
            # log.error: there is no active exception to attach a traceback to
            log.error(message)
            raise TikaException(message)
        srcLang, destLang = options
    else:
        destLang = option

    if srcLang != "" and destLang != "":
        service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
    else:
        service = services["all"] + "/" + Translator + "/" + destLang

    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # context manager so the upload handle is always closed
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                                          {'Accept' : responseMimeType},
                                          verbose, tikaServerJar, requestOptions=requestOptions)
    finally:
        # same cleanup as parse1: do not leave downloaded copies behind
        if mode == 'remote':
            os.unlink(path)
    return (status, response)
                       
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'type': '/detect/stream'}):
    '''
    Run MIME/media type detection on every path obtained from ``getPaths(urlOrPaths)``.
    :param option: key into ``services``
    :param urlOrPaths: url/path or list of them
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``detectType1``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``detectType1`` result per path
    '''
    detected = []
    for path in getPaths(urlOrPaths):
        outcome = detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
        detected.append(outcome)
    return detected

def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'type': '/detect/stream'}, config_path=None, requestOptions={}):
    '''
    Detect the MIME/media type of the stream and return it in text/plain.
    :param option: key into ``services``
    :param urlOrPath: URL, local path, or file object of the document
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param config_path: optional Tika configuration file passed on to ``callServer``
    :param requestOptions: extra keyword arguments for the HTTP request
    :return: tuple ``(status, response)``; when the module-level ``csvOutput`` is 1 the
             response is prefixed with the input name and a comma
    :raises TikaException: when ``option`` is not a key of ``services``
    '''
    # Validate before fetching anything. ``bytes(services.keys())`` raises
    # TypeError on Python 3, so the message is built with join() instead.
    if option not in services:
        message = 'Detect option must be one of %s' % ', '.join(sorted(services))
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # context manager so the upload handle is always closed
        with open(path, 'rb') as f:
            status, response = callServer('put', serverEndpoint, service, f,
                    {
                        'Accept': responseMimeType,
                        'Content-Disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)
                    },
                    verbose, tikaServerJar, config_path=config_path, requestOptions=requestOptions)
    finally:
        # same cleanup as parse1: do not leave downloaded copies behind
        if mode == 'remote':
            os.unlink(path)
    if csvOutput == 1:
        # Only byte strings need decoding; text strings have no .decode()
        # on Python 3 and used to raise AttributeError here.
        name = urlOrPath.decode("UTF-8") if isinstance(urlOrPath, binary_string) else urlOrPath
        return (status, name + "," + response)
    return (status, response)

def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
              services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}, requestOptions={}):
    '''
    Query the Tika Server for part of its configuration and return it in JSON format.
    :param option: key into ``services`` (mime-types, detectors, or parsers by default)
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to ``callServer``
    :param tikaServerJar: URL or path of the Tika server jar
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param requestOptions: extra keyword arguments for the HTTP request
    :return: tuple ``(status, response)``
    '''
    known_option = option in services
    if not known_option:
        # die() prints the usage text and exits the process
        die('config option must be one of mime-types, detectors, or parsers')
    resource = services[option]
    accept_header = {'Accept': responseMimeType}
    status, response = callServer('get', serverEndpoint, resource, None, accept_header, verbose, tikaServerJar, requestOptions=requestOptions)
    return (status, response)

def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
               httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
                rawResponse=False,config_path=None, requestOptions={}):
    '''
    Call the Tika Server, do some error checking, and return the response.
    :param verb: key into ``httpVerbs`` (``'get'``, ``'put'`` or ``'post'`` by default)
    :param serverEndpoint: base URL of the server, e.g. ``http://localhost:9998``
    :param service: resource path appended to the endpoint
    :param data: request body: text, bytes, a file-like object, or ``None``
    :param headers: request headers
    :param verbose: when true, request and response headers are written to stderr
    :param tikaServerJar: URL or path of the Tika server jar
    :param httpVerbs: mapping of verb name to the function performing the request
    :param classpath: extra class path for a locally started server
    :param rawResponse: return the body as bytes instead of decoded text
    :param config_path: optional Tika configuration file for a locally started server
    :param requestOptions: keyword arguments merged over the defaults for the request
    :return: tuple ``(status_code, body)``
    :raises TikaException: when ``verb`` is not a key of ``httpVerbs``
    '''
    parsedUrl = urlparse(serverEndpoint)
    serverHost = parsedUrl.hostname
    scheme = parsedUrl.scheme

    port = parsedUrl.port
    if classpath is None:
        classpath = TikaServerClasspath

    global TikaClientOnly
    if not TikaClientOnly:
        # may download and start a local server; returns the endpoint to use
        serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)

    serviceUrl  = serverEndpoint + service
    if verb not in httpVerbs:
        # join() instead of bytes(httpVerbs.keys()), which raises TypeError on
        # Python 3 and hid the intended TikaException
        message = 'Tika Server call must be one of %s' % ', '.join(sorted(httpVerbs))
        log.error(message)
        raise TikaException(message)
    verbFn = httpVerbs[verb]

    # On Windows file-like bodies are read fully into memory before sending.
    if Windows and hasattr(data, "read"):
        data = data.read()

    encodedData = data
    if isinstance(data, unicode_string):
        encodedData = data.encode('utf-8')

    # NOTE(review): TLS certificate verification is disabled by default;
    # callers can re-enable it with requestOptions={'verify': True}.
    requestOptionsDefault = {
        'timeout': 60,
        'headers': headers,
        'verify': False
    }
    effectiveRequestOptions = requestOptionsDefault.copy()
    effectiveRequestOptions.update(requestOptions)

    resp = verbFn(serviceUrl, encodedData, **effectiveRequestOptions)

    if verbose:
        # ``print(sys.stderr, ...)`` printed the stream object itself to
        # stdout; write to stderr explicitly (valid on Python 2 and 3).
        sys.stderr.write("Request headers: %s\n" % (headers,))
        sys.stderr.write("Response headers: %s\n" % (resp.headers,))
    if resp.status_code != 200:
        log.warning('Tika server returned status: %d', resp.status_code)

    resp.encoding = "utf-8"
    if rawResponse:
        return (resp.status_code, resp.content)
    else:
        return (resp.status_code, resp.text)


def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
    '''
    Make sure a tika-server is reachable at the given endpoint. For a local
    endpoint with nothing listening on the port, the jar is fetched (when
    needed), verified, and started.
    :param scheme: e.g. http or https
    :param serverHost: host name of the server
    :param port: server port; ``None`` selects 443 for https and 80 otherwise
    :param tikaServerJar: URL or path of the Tika server jar
    :param classpath: extra class path for the JVM; defaults to ``TikaServerClasspath``
    :param config_path: optional Tika configuration file handed to ``startServer``
    :return: the endpoint as ``scheme://host:port``
    :raises RuntimeError: when the local server could not be started
    '''
    if classpath is None:
        classpath = TikaServerClasspath
    if port is None:
        if scheme == 'https':
            port = '443'
        else:
            port = '80'

    jar_url = urlparse(tikaServerJar)
    serverEndpoint = '%s://%s:%s' % (scheme, serverHost, port)
    jarPath = os.path.join(TikaJarPath, 'tika-server.jar')

    # Only endpoints that look local are candidates for starting a server.
    looks_local = any(marker in serverEndpoint for marker in ('localhost', '127.0.0.1'))
    if not looks_local:
        return serverEndpoint
    if checkPortIsOpen(serverHost, port):
        # something is already listening on that port
        return serverEndpoint

    jar_missing = not os.path.isfile(jarPath)
    if jar_missing and jar_url.scheme != '':
        getRemoteJar(tikaServerJar, jarPath)

    # A jar failing the checksum test is discarded and fetched once more.
    if not checkJarSig(tikaServerJar, jarPath):
        os.remove(jarPath)
        getRemoteJar(tikaServerJar, jarPath)

    started = startServer(jarPath, TikaJava, TikaJavaArgs, serverHost, port, classpath, config_path)
    if not started:
        log.error("Failed to receive startup confirmation from startServer.")
        raise RuntimeError("Unable to start Tika server.")
    return serverEndpoint

def checkJarSig(tikaServerJar, jarPath):
    '''
    Checks the MD5 checksum of the jar against the ``<jarPath>.md5`` file,
    fetching that file from ``<tikaServerJar>.md5`` when it is not present.
    :param tikaServerJar: URL or path the jar was obtained from
    :param jarPath: local path of the jar
    :return: ``True`` if the signature of the jar matches
    '''
    md5Path = jarPath + ".md5"
    if not os.path.isfile(md5Path):
        getRemoteJar(tikaServerJar + ".md5", md5Path)

    # Hash in chunks rather than reading the whole jar into memory.
    digest = hashlib.md5()
    with open(jarPath, 'rb') as jar:
        for chunk in iter(lambda: jar.read(1024 * 1024), b''):
            digest.update(chunk)

    with open(md5Path, "r") as em:
        fields = em.read().split()
    # Checksum files may end with a newline or carry a file name after the
    # hash ("<hash>  <name>"); comparing the raw contents made a valid jar
    # fail the check and be deleted and downloaded again.
    expected = fields[0].lower() if fields else ''
    return expected == digest.hexdigest()


def startServer(tikaServerJar, java_path = TikaJava, java_args = TikaJavaArgs, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
    '''
    Starts Tika Server
    :param tikaServerJar: path to tika server jar
    :param java_path: java executable used to launch the server
    :param java_args: extra arguments placed after the java executable
    :param serverHost: accepted for interface compatibility; the bind host is
                       ``localhost`` (``0.0.0.0`` on Windows) regardless of this value
    :param port: the host port to be used for binding the service
    :param classpath: Class path value to pass to JVM
    :param config_path: optional Tika configuration file passed as ``--config``
    :return: ``True`` when the startup message was found in the server log, ``False`` otherwise
    '''
    global TikaServerProcess

    if classpath is None:
        classpath = TikaServerClasspath

    host = "localhost"
    if Windows:
        host = "0.0.0.0"

    if classpath:
        # os.pathsep is ':' on POSIX and ';' on Windows, which is what the
        # JVM expects; a hard-coded ':' produced a broken class path on Windows.
        classpath += os.pathsep + tikaServerJar
    else:
        classpath = tikaServerJar

    # setup command string
    # NOTE(review): the command runs through the shell and the values below
    # are interpolated unescaped -- only pass trusted paths and arguments.
    if not config_path:
        cmd_string = '%s %s -cp "%s" org.apache.tika.server.TikaServerCli --port %s --host %s &' \
                     % (java_path, java_args, classpath, port, host)
    else:
        cmd_string = '%s %s -cp "%s" org.apache.tika.server.TikaServerCli --port %s --host %s --config %s &' \
                     % (java_path, java_args, classpath, port, host, config_path)

    # Check that we can write to log path
    try:
        tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
        logFile = open(tika_log_file_path, 'w')
    except PermissionError as e:
        log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
        return False

    try:
        # Check that specified java binary is available on path
        try:
            # Wait for the probe and close the null device so that neither a
            # zombie process nor open handles are left behind.
            with open(os.devnull, "w") as devnull:
                Popen(java_path, stdout=devnull, stderr=devnull).wait()
        except FileNotFoundError as e:
            log.error("Unable to run java; is it installed?")
            return False

        # Run java with jar args
        # Patch for Windows support
        if Windows:
            if sys.version.startswith("2"):
                # Python 2.x
                TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
            elif sys.version.startswith("3"):
                # Python 3.x
                TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True, start_new_session=True)
        else:
            # own session/process group, so killServer() can signal the whole group
            TikaServerProcess = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True, preexec_fn=os.setsid)
    finally:
        # the child holds its own copy of the handle; release ours
        logFile.close()

    # Check logs and retry as configured
    is_started = False
    for _attempt in range(TikaStartupMaxRetry):
        with open(tika_log_file_path, "r") as tika_log_file_tmp:
            # check for INFO string to confirm listening endpoint
            if "Started Apache Tika server at" in tika_log_file_tmp.read():
                is_started = True
        if is_started:
            # Stop polling as soon as the server reports it is up; the loop
            # used to keep sleeping for every remaining retry.
            break
        log.warning("Failed to see startup log message; retrying...")
        time.sleep(TikaStartupSleep)

    if not is_started:
        log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
        return False
    return True

def killServer():
    '''
    Kills the tika server started by the current execution instance.
    Logs an error when no server was started by this process or when the
    termination attempt fails.
    '''
    if not TikaServerProcess:
        log.error("Server not running, or was already running before")
        return

    pid = TikaServerProcess.pid
    # One platform-specific attempt. The process group used to be signalled
    # unconditionally first and then a second time on POSIX, which always
    # logged a spurious failure (on Windows os.killpg does not exist; on POSIX
    # the group was already gone on the second call).
    try:
        if Windows:
            # patch to support subprocess killing for windows
            if sys.version.startswith("2"):
                # Python 2.x
                PROCESS_TERMINATE = 1
                handle = ctypes.windll.kernel32.OpenProcess(PROCESS_TERMINATE, False, pid)
                ctypes.windll.kernel32.TerminateProcess(handle, -1)
                ctypes.windll.kernel32.CloseHandle(handle)
            elif sys.version.startswith("3"):
                # Python 3.x
                os.kill(pid, signal.SIGTERM)
        else:
            # the server was started in its own session, so signal the group
            os.killpg(os.getpgid(pid), signal.SIGTERM)
    except (OSError, AttributeError):
        log.error("Failed to kill the current server session")
    # give the process a moment to shut down
    time.sleep(1)

def toFilename(url):
    '''
    Derive a file name from the path component of a URL.
    Characters other than word characters, whitespace, ``.`` and ``-`` become
    ``-``; the result is lower-cased, runs of dashes/whitespace are collapsed,
    and at most the last 200 characters are kept.
    :param url: the URL to convert
    :return: a non-empty file name; ``file_<unix time>`` when the URL path
             yields nothing usable (e.g. no path, or just ``/``)
    '''
    urlp = urlparse(url)
    fallback = "file_{}".format(int(time.time()))
    path = urlp.path or fallback
    value = re.sub(r'[^\w\s\.\-]', '-', path).strip().lower()
    name = re.sub(r'[-\s]+', '-', value).strip("-")[-200:]
    # A path such as "/" sanitises to an empty string, which would make the
    # download target a directory; use the generated name in that case too.
    return name or fallback


def _is_file_object(f):
    try:
        file_types = (types.FileType, io.IOBase)
    except AttributeError:
        file_types = (io.IOBase,)

    return isinstance(f, file_types)

def getRemoteFile(urlOrPath, destPath):
    '''
    Resolve the input to something that can be opened locally, downloading
    http(s) resources into ``destPath`` first.
    :param urlOrPath: resource locator: URL, path, or an open file object
    :param destPath: directory in which a downloaded resource is stored
    :return: tuple having (path, 'local'/'remote'/'binary')
    '''
    # handle binary stream input
    if _is_file_object(urlOrPath):
        return (urlOrPath.name, 'binary')

    scheme = urlparse(urlOrPath).scheme
    if scheme == '':
        # plain path: hand back its absolute form
        return (os.path.abspath(urlOrPath), 'local')
    if scheme not in ('http', 'https'):
        # any other scheme is returned unchanged
        return (urlOrPath, 'local')

    target = destPath + '/' + toFilename(urlOrPath)
    log.info('Retrieving %s to %s.' % (urlOrPath, target))
    try:
        urlretrieve(urlOrPath, target)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # (switches the process-wide default HTTPS context to an unverified one)
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # delete whatever we had there
        if os.path.exists(target) and os.path.isfile(target):
            os.remove(target)
        urlretrieve(urlOrPath, target)
    return (target, 'remote')

def getRemoteJar(urlOrPath, destPath):
    '''
    Download the resource to ``destPath``, unless the locator has no URL
    scheme, in which case its absolute path is returned without any download.
    :param urlOrPath: remote resource locator
    :param destPath: Path to store the resource, usually a path on file system
    :return: tuple having (path, 'local'/'remote')
    '''
    if urlparse(urlOrPath).scheme == '':
        return (os.path.abspath(urlOrPath), 'local')

    log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
    try:
        urlretrieve(urlOrPath, destPath)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        # (switches the process-wide default HTTPS context to an unverified one)
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # delete whatever we had there
        leftover = os.path.exists(destPath) and os.path.isfile(destPath)
        if leftover:
            os.remove(destPath)
        urlretrieve(urlOrPath, destPath)

    return (destPath, 'remote')
    
def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
    '''
    Checks if the specified port is open
    :param remoteServerHost: the host address
    :param port: port which needs to be checked
    :return: ``True`` if port is open, ``False`` otherwise
    '''
    # Name resolution happens before the try block, so a resolution failure
    # propagates to the caller as socket.gaierror.
    remoteServerIP  = socket.gethostbyname(remoteServerHost)
    sock = None
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # connect_ex returns 0 on success and an error code otherwise
        return sock.connect_ex((remoteServerIP, int(port))) == 0

    except KeyboardInterrupt:
        print("You pressed Ctrl+C")
        sys.exit()

    except socket.gaierror:
        print('Hostname could not be resolved. Exiting')
        sys.exit()

    except socket.error:
        print("Couldn't connect to server")
        sys.exit()

    finally:
        # sock stays None when socket creation itself failed; closing an
        # unbound name used to replace the real error with UnboundLocalError
        if sock is not None:
            sock.close()

def main(argv=None):
    """
    Run Tika from command line according to USAGE.
    :param argv: full argument vector including the program name; defaults to ``sys.argv``
    :return: the result of ``runCommand`` for the requested command
    :raises TikaException: on missing arguments or an invalid option
    """
    global Verbose
    global EncodeUtf8
    global csvOutput
    if argv is None:
        argv = sys.argv

    if (len(argv) < 3 and not (('-h' in argv) or ('--help' in argv))):
        # log.error: there is no active exception to attach a traceback to
        log.error('Bad args')
        raise TikaException('Bad args')
    try:
        # -v, -e and -c are plain flags; the former 'v:' and 'e:' made them
        # swallow the following argument as their value.
        opts, args = getopt.getopt(argv[1:], 'hi:s:o:p:vec',
          ['help', 'install=', 'server=', 'output=', 'port=', 'verbose', 'encode', 'csv'])
    except getopt.GetoptError as opt_error:
        # GetoptError cannot be unpacked like a tuple on Python 3; read its
        # documented ``msg`` and ``opt`` attributes instead.
        message = "%s error: Bad option: %s, %s" % (argv[0], opt_error.opt, opt_error.msg)
        log.exception(message)
        raise TikaException(message)

    tikaServerJar = TikaServerJar
    serverHost = ServerHost
    outDir = '.'
    port = Port
    for opt, val in opts:
        if opt in ('-h', '--help'):
            echo2(USAGE)
            sys.exit()
        # The short forms -i/-s/-p were only matched by accident before
        # (substring test against a plain string); list them explicitly.
        elif opt in ('-i', '--install'):
            tikaServerJar = val
        elif opt in ('-s', '--server'):
            serverHost = val
        elif opt in ('-o', '--output'):
            outDir = val
        elif opt in ('-p', '--port'):
            port = val
        elif opt in ('-v', '--verbose'):
            Verbose = 1
        elif opt in ('-e', '--encode'):
            EncodeUtf8 = 1
        elif opt in ('-c', '--csv'):
            csvOutput = 1
        else:
            raise TikaException(USAGE)

    # Both the command and its option are required positionals; report a
    # TikaException rather than an IndexError when they are missing.
    if len(args) < 2:
        log.error('Bad args')
        raise TikaException('Bad args')
    cmd = args[0]
    option = args[1]
    paths = args[2:]
    return runCommand(cmd, option, paths, port, outDir, serverHost=serverHost, tikaServerJar=tikaServerJar, verbose=Verbose, encode=EncodeUtf8)


# Script entry point: run the command given on the command line and write its
# result to standard output encoded as UTF-8.
if __name__ == '__main__':
    log.info("Logging on '%s'" % (log_file))
    resp = main(sys.argv)

    # Set encoding of the terminal to UTF-8
    if sys.version.startswith("2"):
        # Python 2.x
        out = codecs.getwriter("UTF-8")(sys.stdout)
    elif sys.version.startswith("3"):
        # Python 3.x
        out = codecs.getwriter("UTF-8")(sys.stdout.buffer)

    if type(resp) == list:
        # The parse command returns a list of output paths (plain strings);
        # the other list-returning commands yield (status, response) tuples.
        # Indexing a path with [1] printed only its second character.
        out.write('\n'.join([r if isinstance(r, (unicode_string, binary_string)) else r[1]
                             for r in resp]))
    else:
        out.write(resp)
    out.write('\n')