#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function from __future__ import absolute_import from builtins import open import argparse import os import shutil import fileinput import sys import glob import textwrap from lxml import etree import chardet from . import manual_labeling from . import training from . import parser_template from . import data_prep_utils def dispatch(): parser = argparse.ArgumentParser(description="") parser_subparsers = parser.add_subparsers() # Arguments for label command sub_label = parser_subparsers.add_parser('label') sub_label.add_argument(dest='infile', help='input csv filepath for the label task', type=file_type) sub_label.add_argument(dest='outfile', help='output xml filepath for the label task', action=XML) sub_label.add_argument(dest='module', help='parser module name', type=python_module) sub_label.set_defaults(func=label) # Arguments for train command sub_train = parser_subparsers.add_parser('train') sub_train.add_argument(dest='traindata', help='comma separated xml filepaths, or "path/to/traindata/*.xml"', type=training_data) sub_train.add_argument(dest='module', help='parser module name', type=python_module) sub_train.add_argument('--modelfile', dest='model_path', help='location of model file', action=ModelFile, required=False) sub_train.set_defaults(func=train) # Arguments for init command sub_init = parser_subparsers.add_parser('init') sub_init.add_argument(dest='modulename', help='module name for a new parser') sub_init.set_defaults(func=init) if len(sys.argv[1:]) == 0: parser.print_help() parser.exit() args = parser.parse_args() args.func(args) def label(args) : manual_labeling.label(args.module, args.infile, args.outfile, args.xml) def train(args) : training_data = args.traindata module = args.module model_path = args.model_path if model_path is None: model_path = module.__name__ + '/' +module.MODEL_FILE if hasattr(module, 'MODEL_FILES'): msg = """ NOTE: this parser allows for multiple model files You can specify a model with the --modelfile argument Models available:""" print(textwrap.dedent(msg)) for m in module.MODEL_FILES: print(" - %s" % m) print("Since no model was specified, we will train the default model") training.train(module, training_data, model_path) def init(args) : name = args.modulename data = "raw" training = "training" tests = 'tests' dirs_to_mk = [name, data, training, tests] print('\nInitializing directories for %s' %name, sys.stderr) for directory in dirs_to_mk: if not os.path.exists(directory): os.mkdir(directory) print('* %s' %directory, sys.stderr) print('\nGenerating __init__.py', sys.stderr) init_path = name + '/__init__.py' if os.path.exists(init_path): print(' warning: %s already exists' %init_path, sys.stderr) else: with open(init_path, "w") as f: f.write(parser_template.init_template()) print('* %s' %init_path) print('\nGenerating setup.py') if os.path.exists('setup.py'): print(' warning: setup.py already exists', sys.stderr) else: with open('setup.py', 'w') as f: f.write(parser_template.setup_template(name)) print('* setup.py', sys.stderr) print('\nGenerating test file', sys.stderr) token_test_path = tests+'/test_tokenizing.py' if os.path.exists(token_test_path): print(' warning: %s already exists' % token_test_path, sys.stderr) else: with open(token_test_path, 'w') as f: f.write(parser_template.test_tokenize_template(name)) print('* %s' %token_test_path, sys.stderr) class XML(argparse.Action): def __call__(self, parser, namespace, string, option_string): try: with open(string, 'r') as f: tree = etree.parse(f) xml = tree.getroot() except (OSError, IOError): xml = None except etree.XMLSyntaxError as e: if 'Document is empty' not in str(e): raise argparse.ArgumentError(self, "%s does not seem to be a valid xml file" % string) xml = None setattr(namespace, self.dest, string) setattr(namespace, 'xml', xml) def file_type(arg): try: f = open(arg, 'rb') except OSError as e: message = _("can't open '%s': %s") raise ArgumentTypeError(message % (arg, e)) else: detector = chardet.universaldetector.UniversalDetector() for line in f.readlines(): detector.feed(line) if detector.done: break f.close() detector.close() f = open(arg, 'r', encoding=detector.result['encoding']) return f def training_data(arg): all_files = [] for path in arg.split(','): all_files.extend(glob.glob(path)) xml_files = [f for f in all_files if (f.lower().endswith('.xml') and os.path.isfile(f))] if not xml_files: raise argparse.ArgumentTypeError('Please specify one or more xml training files (comma separated) [--trainfile FILE]') training_data = set() for xml_file in xml_files: with open(xml_file, 'r') as f: try: tree = etree.parse(f) except etree.XMLSyntaxError: raise argparse.ArgumentTypeError('%s is not a valid xml file' % (f.name,)) file_xml = tree.getroot() training_data.update(data_prep_utils.TrainingData(file_xml)) if not training_data: raise argparse.ArgumentTypeError("No training data found. Perhaps double check " "your training data filepaths?") msg = """ training model on {num} training examples from {file_list} file(s)""" print(textwrap.dedent(msg.format(num=len(training_data), file_list=xml_files))) return training_data class ModelFile(argparse.Action): def __call__(self, parser, namespace, model_file, option_string): module = namespace.module if hasattr(module, 'MODEL_FILES'): try: model_path = module.__name__ + '/' +module.MODEL_FILES[model_file] except KeyError: msg = """ Invalid --modelfile argument Models available: %s""" raise argparse.ArgumentTypeError(text.dedent(msg) % module.MODEL_FILES) else: raise argparse.ArgumentError(self, 'This parser does not allow for multiple models') setattr(namespace, self.dest, model_path) def python_module(arg): module = __import__(arg) return module