python source code of data

Project: nlp-architect (GitHub Link)

nlp-architect-master
- .github
  - ISSUE_TEMPLATE
    - question.md
    - feature-request-improvement.md
    - bug_report.md
  - issue_template.md
  - workflows
    - ci.yml
    - stale.yml
- datasets
  - H2H requests detection
    - Human-to-Human Actionable Requests Dataset.csv
  - wikipedia
    - enwiki-20171201_subset_license.txt
  - absa
  - ecb
- .jenkins
  - Jenkinsfile
- examples
  - word_language_model_with_tcn
    - adding_problem
      - adding_with_tcn.py
      - __init__.py
      - README.md
      - adding_model.py
    - mle_language_model
      - language_modeling_with_tcn.py
      - lm_model.py
      - __init__.py
    - images
    - toy_data
      - adding.py
    - __init__.py
    - README.md
  - np_semantic_segmentation
    - data.py
    - feature_extraction.py
    - preprocess_tratz2011.py
    - train.py
    - __init__.py
    - README.md
    - inference.py
  - crosslingembs
    - evaluate.py
    - train.py
    - README.md
  - np2vec
    - train.py
    - __init__.py
    - README.md
    - inference.py
  - cross_doc_coref
    - relation_extraction_example.py
    - __init__.py
    - cross_doc_coref_sieves.py
  - chunker
    - train.py
    - __init__.py
    - README.md
    - inference.py
  - absa
    - solution
      - absa_solution.py
      - __init__.py
    - aspects.csv
    - opinions.csv
    - __init__.py
    - inference
      - __init__.py
      - inference.py
      - interactive.py
    - train
      - train.py
      - __init__.py
  - ner
    - train.py
    - __init__.py
    - README.md
    - interactive.py
  - most_common_word_sense
    - prepare_data.py
    - feature_extraction.py
    - data
      - input_inference_examples.csv
    - train.py
    - __init__.py
    - README.md
    - inference.py
  - reading_comprehension
    - match_lstm_mrc
      - machine_comprehension_api.py
      - prepare_data.py
      - mrc_utils.py
      - train.py
      - __init__.py
      - README.md
      - matchlstm_ansptr.py
    - __init__.py
  - intent_extraction
    - train_seq2seq_model.py
    - train_mtl_model.py
    - README.md
    - interactive.py
  - __init__.py
  - supervised_sentiment
    - example_ensemble.py
    - test_ensembler.py
    - amazon_reviews.py
    - optimize_example.py
    - __init__.py
    - README.md
    - supervised_sentiment.py
    - ensembler.py
  - requirements.txt
  - memn2n_dialogue
    - interactive_utils.py
    - memn2n_dialogue.py
    - babi_dialog.py
    - train_model.py
    - __init__.py
    - README.md
    - interactive.py
  - sparse_gnmt
    - gnmt
      - utils
        standard_hparams_utils.py
        iterator_utils.py
        nmt_utils.py
        evaluation_utils.py
        rouge.py
        __init__.py
        misc_utils.py
        bleu.py
        vocab_utils.py
      - attention_model.py
      - model.py
      - __init__.py
      - model_helper.py
    - nmt.py
    - train.py
    - __init__.py
    - README.md
    - inference.py
    - gnmt_model.py
    - standard_hparams
      - wmt16_gnmt_4_layer.json
      - sparse_wmt16_gnmt_4_layer.json
- licenses
  - displacy_ent-license.txt
  - hyperopt-license.txt
  - keras_contrib-license.txt
  - matplotlib-license.txt
  - spacy-license.txt
  - bootstrap-license.txt
  - nbformat-license.txt
  - ipython-license.txt
  - pep8-license.txt
  - beautifulsoap-license.txt
  - dynet-license.txt
  - neon-license.txt
  - future-license.txt
  - conlleval-license.txt
  - pandas-license.txt
  - nltk-license.txt
  - scikit_learn-license.txt
  - pytorch-transformers-license.txt
  - jupyter-license.txt
  - displacy-license.txt
  - pillow-license.txt
  - sphinx-license.txt
  - ipywidgets-license.txt
  - textacy-license.txt
  - falcon-license.txt
  - sphinx_rtd_theme-license.txt
  - bokeh-license.txt
  - ipdb-license.txt
  - scipy-license.txt
  - requests-license.txt
  - h5py-license.txt
  - hug-license.txt
  - ftfy-license.txt
  - pytest_mock-license.txt
  - pylint-license.txt
  - tqdm-license.txt
  - pyyaml-license.txt
  - setuptools-license.txt
  - facebook_memnn-license.txt
  - gensim-license.txt
  - flake8-license.txt
  - nmt-license.txt
  - termcolor-license.txt
  - bist_parser-license.txt
  - numpy-license.txt
  - tensorflow-license.txt
  - pytorch-license.txt
  - pytest-license.txt
  - six-license.txt
  - pytest_cov-license.txt
  - ngraph_python-license.txt
  - falcon_multipart-license.txt
  - configArgParse-license.txt
  - keras-license.txt
  - ansi2html-license.txt
- 3rd Party Notice.txt
- CNAME
- LICENSE
- docs-source
  - Makefile
  - source
    - _quick_install.rst
    - word_sense.rst
    - absa_solution.rst
    - absa.rst
    - transformers_distillation.rst
    - code_docs
      - nlp_architect.api.rst
    - tcn.rst
    - quick_start.rst
    - writing_tests.rst
    - term_set_expansion.rst
    - spacy_bist.rst
    - archived
      - additional.rst
      - crosslingual_emb.rst
      - reading_comprehension.rst
      - memn2n.rst
    - bist_parser.rst
    - intent.rst
    - information_extraction.rst
    - supervised_sentiment.rst
    - sentiment.rst
    - transformers.rst
    - cross_doc_coref.rst
    - trend_analysis.rst
    - quantized_bert.rst
    - tagging
      - ner.rst
      - chunker.rst
      - ner_crf.rst
      - sequence_tagging.rst
    - generated_api
      - nlp_architect.common.rst
      - nlp_architect.models.absa.inference.rst
      - nlp_architect.utils.rst
      - nlp_architect.data.cdc_resources.relations.rst
      - nlp_architect.data.cdc_resources.wikipedia.rst
      - nlp_architect.data.cdc_resources.data_types.rst
      - nlp_architect.procedures.rst
      - nlp_architect.nn.torch.data.rst
      - nlp_architect.nn.torch.layers.rst
      - nlp_architect.models.transformers.rst
      - nlp_architect.models.bist.rst
      - nlp_architect.models.rst
      - nlp_architect.api.rst
      - nlp_architect_api_index.rst
      - nlp_architect.data.cdc_resources.gen_scripts.rst
      - nlp_architect.utils.resources.rst
      - nlp_architect.models.bist.eval.conllu.rst
      - nlp_architect.models.bist.eval.rst
      - nlp_architect.data.cdc_resources.embedding.rst
      - nlp_architect.data.cdc_resources.wordnet.rst
      - nlp_architect.nn.tensorflow.rst
      - nlp_architect.nn.rst
      - nlp_architect.pipelines.rst
      - nlp_architect.models.cross_doc_coref.system.sieves.rst
      - nlp_architect.cli.rst
      - nlp_architect.nn.torch.modules.rst
      - nlp_architect.nn.tensorflow.python.rst
      - nlp_architect.nn.tensorflow.python.keras.utils.rst
      - nlp_architect.nn.tensorflow.python.keras.rst
      - nlp_architect.nn.tensorflow.python.keras.layers.rst
      - nlp_architect.data.cdc_resources.rst
      - nlp_architect.common.cdc.rst
      - nlp_architect.models.cross_doc_coref.system.rst
      - nlp_architect.data.cdc_resources.data_types.wn.rst
      - nlp_architect.nlp.rst
      - nlp_architect.models.absa.rst
      - nlp_architect.procedures.transformers.rst
      - nlp_architect.data.cdc_resources.data_types.wiki.rst
      - nlp_architect.models.absa.train.rst
      - nlp_architect.nn.torch.rst
      - nlp_architect.models.cross_doc_coref.rst
      - nlp_architect.data.rst
    - developer_guide.rst
    - spacy_np_annotator.rst
    - model_zoo.rst
    - lm.rst
    - main.rst
    - identifying_semantic_relation.rst
    - CONTRIBUTING.rst
    - np2vec.rst
    - tutorials.rst
    - index.rst
    - conf.py
    - np_segmentation.rst
    - static
      - install.js
      - nlp_arch_theme.css
    - sparse_gnmt.rst
    - installation.rst
    - assets
    - publications.rst
- setup.py
- docker
  - README.md
  - Dockerfile
- solutions
  - start_ui.py
  - absa_solution
    - style
      - bar_chart.css
      - lexicon_manager.css
    - dropdown.js
    - sentiment_solution.py
    - ui.py
    - test_absa_solution.py
    - __init__.py
    - requirements.txt
    - assets
  - set_expansion
    - ui
      - download.js
      - templates
        index.html
      - __init__.py
      - main.py
      - settings.py
      - static
        css
        styles.css
    - prepare_data.py
    - expand_server.py
    - __init__.py
    - README.md
    - set_expand.py
    - requirements.txt
  - trend_analysis
    - test_np_scorer.py
    - topic_extraction.py
    - scoring_utils.py
    - ui
      - __init__.py
    - trend_analysis.py
    - np_scorer.py
    - test_trend_analysis.py
    - __init__.py
    - README.md
    - requirements.txt
    - static
      - css
        styles.css
    - ui_main.py
    - assets
  - __init__.py
- tutorials
  - np_semantic_segmentation
    - np_semantic_segmentation_demo.ipynb
  - ner
    - ner_demo.ipynb
  - NP2vec
    - NP2vec_training.ipynb
  - sentiment
    - Deep_Learning_Sentiment_Demo.ipynb
  - intent_extraction
    - intent_extraction_demo.ipynb
  - Term_Set_Expansion
    - term_set_expansion.ipynb
  - Question_Answering
    - Natural_Language_Question_Answer_Systems.ipynb
- setup.cfg
- README.md
- tests
  - test_utils_embedding.py
  - test_spacy_bist.py
  - test_string_utils.py
  - cdc
    - test_utils.py
    - __init__.py
  - fixtures
    - conll_sample
      - labels.txt
      - data.txt
    - data
      - absa_solution
        expected.csv
      - absa
        core_nlp_doc_3.json
        core_nlp_doc_1.json
        sentiment_doc_2.json
        core_nlp_doc_2.json
        sentiment_doc_1.json
        sentiment_doc_3.json
      - trend_analysis
        target_corpus
        text1.txt
        text2.txt
        reference_corpus
        text1.txt
        text2.txt
      - server
        bist_sentences_examples.json
        tests_data.json
        bist_sentences_examples.json.gz
        ner_sentences_examples.json.gz
        ner_sentences_examples.json
  - test_tcn.py
  - test_data_utils.py
  - test_spacy_np_annotator.py
  - test_server_sanity.py
  - __init__.py
  - utils.py
  - test_absa.py
  - test_quantization.py
  - test_ner_taggers.py
- pytest.ini
- scripts
  - run_tests.sh
  - create_docs.sh
  - check_black.sh
  - server.py
  - check_flake.sh
  - publish_on_pypi.sh
  - run_demo_server.sh
  - check_pylint.sh
- server
  - services.json
  - angular-ui
    - dist
      - angular-ui
        polyfills.d1c7bf4a2ae7c3435f95.js
        styles.af6aa4fdea1e1aed959b.css
        3rdpartylicenses.txt
        runtime.ec2944dd8b20ec099bf3.js
        index.html
        assets
        images
        spacy_ner_screenshot.PNG
        ner_screenshot.PNG
    - __init__.py
  - serve.py
  - __init__.py
  - README.md
  - requirements.txt
  - service.py
- requirements.txt
- pylintrc
- .gitignore
- nlp_architect
  - nlp-inference
  - utils
    - testing.py
    - resources
      - stop_words_en.json
      - determiners_en.json
      - stopwords.txt
      - preposition_en.json
      - pronoun_en.json
      - __init__.py
    - text.py
    - metrics.py
    - file_cache.py
    - embedding.py
    - generic.py
    - string_utils.py
    - io.py
    - __init__.py
    - ansi2html.py
  - nlp
    - __init__.py
  - version.py
  - api
    - intent_extraction_api.py
    - ner_api.py
    - abstract_api.py
    - __init__.py
    - bist_parser_api.py
    - base.py
  - cli
    - cmd_registry.py
    - __init__.py
  - models
    - intent_extraction.py
    - transformers
      - token_classification.py
      - quantized_bert.py
      - base_model.py
      - sequence_classification.py
      - __init__.py
    - bist_parser.py
    - cross_doc_sieves.py
    - cross_doc_coref
      - sieves_resource.py
      - sieves_config.py
      - __init__.py
      - system
        sieves
        run_sieve_system.py
        sieves.py
        __init__.py
        cdc_utils.py
        sieves_container_init.py
        __init__.py
    - tagging.py
    - bist
      - decoder.py
      - mstlstm.py
      - eval
        eval.pl
        __init__.py
        conllu
        weights.clas
        conll17_ud_eval.py
        __init__.py
      - __init__.py
      - utils.py
      - NOTICE
    - np2vec.py
    - absa
      - __init__.py
      - utils.py
      - inference
        data_types.py
        __init__.py
        inference.py
        lexicons
        IntensifiersLex.csv
        NegationSentLex.csv
      - train
        rules.py
        generate_lexicons.py
        rerank_terms.py
        acquire_terms.py
        data_types.py
        train.py
        __init__.py
        lexicons
        OrdinalNumbersLex.csv
        NegationLex.csv
        DeterminersLex.csv
        ColorsLex.csv
        GenericOpinionLex.csv
        AuxiliariesLex.csv
        IntensifiersLex.csv
        GenericQuantifiersLex.csv
        RerankTrainingData.csv
        GeneralAdjectivesLex.csv
        PrepositionsLex.csv
        TimeAdjectiveLex.csv
        GeographicalAdjectivesLex.csv
        PronounsLex.csv
    - temporal_convolutional_network.py
    - crossling_emb.py
    - np_semantic_segmentation.py
    - pretrained_models.py
    - __init__.py
    - ner_crf.py
    - most_common_word_sense.py
    - chunker.py
  - common
    - cdc
      - __init__.py
      - mention_data.py
      - cluster.py
      - topics.py
    - config.py
    - core_nlp_doc.py
    - high_level_doc.py
    - __init__.py
  - nn
    - torch
      - distillation.py
      - quantization.py
      - layers
        crf.py
        __init__.py
      - data
        __init__.py
        dataset.py
      - __init__.py
      - modules
        embedders.py
        __init__.py
    - __init__.py
    - tensorflow
      - python
        __init__.py
        keras
        utils
        layer_utils.py
        __init__.py
        layers
        crf.py
        __init__.py
        __init__.py
        callbacks.py
      - __init__.py
  - data
    - sequential_tagging.py
    - cdc_resources
      - relations
        wordnet_relation_extraction.py
        relation_extraction.py
        within_doc_coref_extraction.py
        referent_dict_relation_extraction.py
        word_embedding_relation_extraction.py
        __init__.py
        relation_types_enums.py
        wikipedia_relation_extraction.py
        verbocean_relation_extraction.py
        computed_relation_extraction.py
      - wikipedia
        wiki_elastic.py
        wiki_online.py
        __init__.py
        wiki_offline.py
        wiki_search_page_result.py
      - gen_scripts
        create_word_embed_glove_dump.py
        create_reference_dict_dump.py
        create_verbocean_dump.py
        create_word_embed_elmo_dump.py
        __init__.py
        create_wiki_dump.py
        create_wordnet_dump.py
      - __init__.py
      - wordnet
        wordnet_online.py
        __init__.py
        wordnet_offline.py
      - embedding
        embed_elmo.py
        embed_glove.py
        __init__.py
      - data_types
        wn
        __init__.py
        wordnet_page.py
        __init__.py
        wiki
        __init__.py
        wikipedia_pages.py
        wikipedia_page.py
        wikipedia_page_extracted_relations.py
    - ptb.py
    - fasttext_emb.py
    - conll.py
    - intent_datasets.py
    - glue_tasks.py
    - sequence_classification.py
    - __init__.py
    - utils.py
    - NOTICE
  - pipelines
    - spacy_np_annotator.py
    - spacy_bist.py
    - __init__.py
  - __init__.py
  - nlp-train
  - procedures
    - transformers
      - seq_tag.py
      - glue.py
      - __init__.py
      - base.py
    - registry.py
    - token_tagging.py
    - __init__.py
    - procedure.py
  - nlp_architect
- docs
  - spacy_bist.html
  - np_segmentation.html
  - installation.html
  - np2vec.html
  - word_sense.html
  - main.html
  - .nojekyll
  - _quick_install.html
  - code_docs
    - nlp_architect.api.html
  - supervised_sentiment.html
  - intent.html
  - tutorials.html
  - quick_start.html
  - cross_doc_coref.html
  - _sources
    - writing_tests.rst.txt
    - np2vec.rst.txt
    - index.rst.txt
    - tutorials.rst.txt
    - identifying_semantic_relation.rst.txt
    - publications.rst.txt
    - developer_guide.rst.txt
    - bist_parser.rst.txt
    - main.rst.txt
    - _quick_install.rst.txt
    - code_docs
      - nlp_architect.api.rst.txt
    - trend_analysis.rst.txt
    - absa_solution.rst.txt
    - intent.rst.txt
    - spacy_bist.rst.txt
    - quantized_bert.rst.txt
    - model_zoo.rst.txt
    - sentiment.rst.txt
    - archived
      - reading_comprehension.rst.txt
      - crosslingual_emb.rst.txt
      - memn2n.rst.txt
      - additional.rst.txt
    - tcn.rst.txt
    - spacy_np_annotator.rst.txt
    - sparse_gnmt.rst.txt
    - transformers.rst.txt
    - supervised_sentiment.rst.txt
    - CONTRIBUTING.rst.txt
    - absa.rst.txt
    - tagging
      - ner.rst.txt
      - sequence_tagging.rst.txt
      - ner_crf.rst.txt
      - chunker.rst.txt
    - generated_api
      - nlp_architect.utils.rst.txt
      - nlp_architect.data.cdc_resources.relations.rst.txt
      - nlp_architect_api_index.rst.txt
      - nlp_architect.models.transformers.rst.txt
      - nlp_architect.models.absa.inference.rst.txt
      - nlp_architect.data.cdc_resources.wordnet.rst.txt
      - nlp_architect.pipelines.rst.txt
      - nlp_architect.nn.tensorflow.rst.txt
      - nlp_architect.models.absa.rst.txt
      - nlp_architect.data.cdc_resources.data_types.rst.txt
      - nlp_architect.models.cross_doc_coref.system.sieves.rst.txt
      - nlp_architect.procedures.transformers.rst.txt
      - nlp_architect.nn.torch.layers.rst.txt
      - nlp_architect.models.bist.rst.txt
      - nlp_architect.utils.resources.rst.txt
      - nlp_architect.data.cdc_resources.data_types.wn.rst.txt
      - nlp_architect.data.cdc_resources.embedding.rst.txt
      - nlp_architect.nn.tensorflow.python.rst.txt
      - nlp_architect.nn.tensorflow.python.keras.utils.rst.txt
      - nlp_architect.nn.torch.data.rst.txt
      - nlp_architect.cli.rst.txt
      - nlp_architect.nn.tensorflow.python.keras.layers.rst.txt
      - nlp_architect.data.cdc_resources.gen_scripts.rst.txt
      - nlp_architect.models.rst.txt
      - nlp_architect.nn.torch.modules.rst.txt
      - nlp_architect.data.cdc_resources.rst.txt
      - nlp_architect.models.bist.eval.rst.txt
      - nlp_architect.data.rst.txt
      - nlp_architect.models.bist.eval.conllu.rst.txt
      - nlp_architect.nlp.rst.txt
      - nlp_architect.procedures.rst.txt
      - nlp_architect.data.cdc_resources.wikipedia.rst.txt
      - nlp_architect.nn.torch.rst.txt
      - nlp_architect.nn.rst.txt
      - nlp_architect.models.absa.train.rst.txt
      - nlp_architect.common.cdc.rst.txt
      - nlp_architect.api.rst.txt
      - nlp_architect.nn.tensorflow.python.keras.rst.txt
      - nlp_architect.models.cross_doc_coref.rst.txt
      - nlp_architect.models.cross_doc_coref.system.rst.txt
      - nlp_architect.data.cdc_resources.data_types.wiki.rst.txt
      - nlp_architect.common.rst.txt
    - term_set_expansion.rst.txt
    - word_sense.rst.txt
    - cross_doc_coref.rst.txt
    - quick_start.rst.txt
    - transformers_distillation.rst.txt
    - information_extraction.rst.txt
    - lm.rst.txt
    - installation.rst.txt
    - np_segmentation.rst.txt
  - searchindex.js
  - archived
    - crosslingual_emb.html
    - reading_comprehension.html
    - memn2n.html
    - additional.html
  - publications.html
  - tcn.html
  - CNAME
  - transformers_distillation.html
  - CONTRIBUTING.html
  - information_extraction.html
  - _modules
    - transformers
      - configuration_roberta.html
    - index.html
    - nlp_architect
      - utils
        generic.html
        io.html
        metrics.html
        testing.html
        text.html
        string_utils.html
        ansi2html.html
        file_cache.html
        embedding.html
      - api
        abstract_api.html
        ner_api.html
        base.html
        bist_parser_api.html
        intent_extraction_api.html
      - cli
        cli_commands.html
      - models
        transformers
        base_model.html
        sequence_classification.html
        token_classification.html
        quantized_bert.html
        tagging.html
        ner_crf.html
        np2vec.html
        cross_doc_coref
        sieves_config.html
        sieves_resource.html
        system
        sieves
        run_sieve_system.html
        sieves.html
        sieves_container_init.html
        cdc_utils.html
        pretrained_models.html
        bist
        utils.html
        eval
        conllu
        conll17_ud_eval.html
        mstlstm.html
        decoder.html
        absa
        utils.html
        inference
        inference.html
        data_types.html
        train
        train.html
        rerank_terms.html
        generate_lexicons.html
        data_types.html
        rules.html
        acquire_terms.html
        intent_extraction.html
        bist_parser.html
        most_common_word_sense.html
        chunker.html
        np_semantic_segmentation.html
        crossling_emb.html
        temporal_convolutional_network.html
        cross_doc_sieves.html
      - models.html
      - common
        config.html
        cdc
        cluster.html
        topics.html
        mention_data.html
        core_nlp_doc.html
        high_level_doc.html
      - nn
        torch.html
        torch
        layers
        crf.html
        quantization.html
        distillation.html
        data
        dataset.html
        modules
        embedders.html
        tensorflow
        python
        keras
        utils
        layer_utils.html
        callbacks.html
        layers
        crf.html
      - data
        sequence_classification.html
        sequential_tagging.html
        cdc_resources
        relations
        wikipedia_relation_extraction.html
        relation_extraction.html
        computed_relation_extraction.html
        wordnet_relation_extraction.html
        referent_dict_relation_extraction.html
        verbocean_relation_extraction.html
        relation_types_enums.html
        word_embedding_relation_extraction.html
        within_doc_coref_extraction.html
        wikipedia
        wiki_online.html
        wiki_search_page_result.html
        wiki_offline.html
        wiki_elastic.html
        gen_scripts
        create_word_embed_elmo_dump.html
        wordnet
        wordnet_offline.html
        wordnet_online.html
        embedding
        embed_glove.html
        embed_elmo.html
        data_types
        wn
        wordnet_page.html
        wiki
        wikipedia_page_extracted_relations.html
        wikipedia_pages.html
        wikipedia_page.html
        ptb.html
        intent_datasets.html
        utils.html
        fasttext_emb.html
        conll.html
        glue_tasks.html
      - pipelines
        spacy_bist.html
        spacy_np_annotator.html
      - procedures
        transformers
        glue.html
        base.html
        seq_tag.html
        token_tagging.html
        procedure.html
        registry.html
      - cli.html
  - _images
  - developer_guide.html
  - bist_parser.html
  - sparse_gnmt.html
  - tagging
    - ner_crf.html
    - ner.html
    - chunker.html
    - sequence_tagging.html
  - generated_api
    - nlp_architect.pipelines.html
    - nlp_architect.data.cdc_resources.wordnet.html
    - nlp_architect.data.cdc_resources.relations.html
    - nlp_architect.data.cdc_resources.data_types.html
    - nlp_architect.data.cdc_resources.data_types.wiki.html
    - nlp_architect.data.cdc_resources.html
    - nlp_architect.models.bist.html
    - nlp_architect.nn.tensorflow.python.keras.layers.html
    - nlp_architect.nn.torch.html
    - nlp_architect.common.html
    - nlp_architect.nn.html
    - nlp_architect.nn.torch.data.html
    - nlp_architect.models.absa.html
    - nlp_architect.models.cross_doc_coref.system.sieves.html
    - nlp_architect.nn.tensorflow.python.html
    - nlp_architect.models.html
    - nlp_architect.models.bist.eval.html
    - nlp_architect.data.cdc_resources.wikipedia.html
    - nlp_architect.utils.resources.html
    - nlp_architect.nn.tensorflow.python.keras.utils.html
    - nlp_architect_api_index.html
    - nlp_architect.data.cdc_resources.gen_scripts.html
    - nlp_architect.procedures.html
    - nlp_architect.common.cdc.html
    - nlp_architect.procedures.transformers.html
    - nlp_architect.data.cdc_resources.embedding.html
    - nlp_architect.models.bist.eval.conllu.html
    - nlp_architect.models.transformers.html
    - nlp_architect.data.cdc_resources.data_types.wn.html
    - nlp_architect.api.html
    - nlp_architect.data.html
    - nlp_architect.cli.html
    - nlp_architect.nn.torch.modules.html
    - nlp_architect.models.cross_doc_coref.system.html
    - nlp_architect.nn.torch.layers.html
    - nlp_architect.utils.html
    - nlp_architect.nn.tensorflow.html
    - nlp_architect.nlp.html
    - nlp_architect.nn.tensorflow.python.keras.html
    - nlp_architect.models.absa.inference.html
    - nlp_architect.models.cross_doc_coref.html
  - absa_solution.html
  - .buildinfo
  - lm.html
  - trend_analysis.html
  - objects.inv
  - model_zoo.html
  - term_set_expansion.html
  - identifying_semantic_relation.html
  - writing_tests.html
  - py-modindex.html
  - transformers.html
  - spacy_np_annotator.html
  - index.html
  - absa.html
  - search.html
  - quantized_bert.html
  - sentiment.html
  - _static
    - ajax-loader.gif
    - jquery.js
    - install.js
    - basic.css
    - pygments.css
    - searchtools.js
    - nlp_arch_theme.css
    - documentation_options.js
    - language_data.js
    - doctools.js
    - fonts
      - fontawesome-webfont.woff2
      - fontawesome-webfont.woff
      - fontawesome-webfont.eot
      - RobotoSlab-Bold.ttf
      - Lato
        lato-bold.woff2
        lato-bolditalic.woff2
        lato-italic.woff2
        lato-regular.woff2
      - RobotoSlab
        roboto-slab-v7-bold.woff2
        roboto-slab-v7-regular.eot
        roboto-slab-v7-bold.ttf
        roboto-slab-v7-bold.eot
        roboto-slab-v7-regular.woff
        roboto-slab-v7-bold.woff
        roboto-slab-v7-regular.ttf
        roboto-slab-v7-regular.woff2
      - Inconsolata.ttf
      - Inconsolata-Bold.ttf
      - RobotoSlab-Regular.ttf
      - fontawesome-webfont.ttf
      - Inconsolata-Regular.ttf
    - websupport.js
    - underscore-1.3.1.js
    - js
      - theme.js
      - modernizr.min.js
    - underscore.js
    - css
      - badge_only.css
      - theme.css
- MANIFEST.in

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

# pylint: disable=global-statement,redefined-outer-name

import argparse
import csv
import math
import os
from multiprocessing import Pool

import feature_extraction as fe
import numpy
from tqdm import tqdm

from nlp_architect.utils.io import (
    validate_existing_filepath,
    validate_parent_exists,
    validate_proxy_path,
)

wordnet = None
wikidata = None
word2vec = None


def build_feature_vector(np):
    """
    Build a feature vector for the given noun-phrase. the size of the
    vector = (4 + (#WORDS X 300)) = 1506. ==> (the number of words in a noun-phrase X size
    of the word2vec(300)) + 4 additional features from external sources

    Args:
        np (str): a noun-phrase

    Returns:
        :obj:`np.ndarray`: the feature vector of the np
    """
    feature_vector = []
    # 1. find if np exist as an entity in WordNet
    wordnet_feature = find_wordnet_entity(np)
    feature_vector.append(wordnet_feature)
    # 2. find if np exist as an entity in Wikidata
    wikidata_feature = find_wikidata_entity(np)
    feature_vector.append(wikidata_feature)
    # 3. word2vec score: from
    word2vec_distance = word2vec.get_similarity_score(np)
    feature_vector.append(word2vec_distance)
    for w in np.split(" "):
        feature_vector.extend(word2vec.get_word_embedding(w))
    return numpy.array(feature_vector)


def find_wordnet_entity(np):
    """
    extract WordNet indicator-feature (1 if exist in WordNet, else 0)

    Args:
        np (str): a noun-phrase

    Returns:
        int: 1 if exist in WordNet, else 0
    """
    candidates = expand_np_candidates(np, True)
    return wordnet.find_wordnet_existence(candidates)


def find_wikidata_entity(np):
    """
    extract Wikidata indicator-feature (1 if exist in Wikidata, else 0)

    Args:
        np (str): a noun-phrase

    Returns:
        int: 1 if exist in Wikidata, else 0
    """
    candidates = expand_np_candidates(np, True)
    return wikidata.find_wikidata_existence(candidates)


def expand_np_candidates(np, stemming):
    """
    Create all case-combination of the noun-phrase (nyc to NYC, israel to Israel etc.)

    Args:
        np (str): a noun-phrase
        stemming (bool): True if to add case-combinations of noun-phrases's stem

    Returns:
        list(str): All case-combination of the noun-phrase
    """
    candidates = []
    # create all case-combinations of np-> nyc to NYC, israel to Israel etc.
    candidates.extend(get_all_case_combinations(np))
    if stemming:
        # create all case-combinations of np's stem-> t-shirts to t-shirt etc.
        # pylint: disable=no-member
        candidates.extend(get_all_case_combinations(fe.stem(np)))
    return candidates


def get_all_case_combinations(np):
    """
    Returns all case combinations for the noun-phrase (regular, upper, lower, title)
    Args:
        np (str): a noun-phrase
    Returns:
        list(str): List of all case combinations
    """
    candidates = [np, np.upper(), np.lower(), np.title()]
    return candidates


def write_to_csv(output, np_feature_vectors, np_dic, np_list):
    """
    Write data to csv file

    Args:
        output (str): output file path
        np_feature_vectors (:obj:`np.ndarray`): numpy vectors
        np_dic (dict): dict, keys: the noun phrase, value: the features
        np_list (list): features list
    """
    with open(output, "w", encoding="utf-8") as out_file:
        writer = csv.writer(out_file, delimiter=",", quotechar='"')
        print("prepared data CSV file is saved in {0}".format(output))
        for i, _ in enumerate(np_feature_vectors):
            np_vector = np_feature_vectors[i]
            np_vector = numpy.append(np_vector, np_dic[np_list[i]])
            writer.writerow(np_vector)


def prepare_data(data_file, output_file, word2vec_file, http_proxy=None, https_proxy=None):
    """
    Extract for each noun-phrase a feature vector (W2V, WordNet, Wikidata, NPMI, UCI).
    Write the feature vectors to --output specifies local path

    Args:
        data_file(str): file_path to input data
        output_file(str): file_path to output processed data
        word2vec_file(str): file_path to word2vec model
        http_proxy(str): http_proxy
        https_proxy(str): https_proxy
    """
    # init_resources:
    global wordnet, wikidata, word2vec
    # pylint: disable=no-member
    wordnet = fe.Wordnet()
    # pylint: disable=no-member
    wikidata = fe.Wikidata(http_proxy, https_proxy)
    print("Start loading Word2Vec model (this might take a while...)")
    # pylint: disable=no-member
    word2vec = fe.Word2Vec(word2vec_file)
    print("Finish loading feature extraction services")
    reader_list = read_csv_file_data(data_file)
    np_dic = {}
    np_list = []
    for row in reader_list:
        np_dic[row[0]] = row[1]
        np_list.append(row[0])
    p = Pool(10)
    np_feature_vectors = list(
        tqdm(p.imap(build_feature_vector, np_list), total=len(np_list))
    )  # , desc="np feature extraction status"))
    write_to_csv(output_file, np_feature_vectors, np_dic, np_list)


def read_csv_file_data(input_path):
    """
    Read csv file to a list

    Args:
        input_path (str): read csv file from this local file path

    Returns:
        list(str): A list where each item is a row in the csv file
    """
    # 1. read csv file
    if not os.path.isabs(input_path):
        # handle case using default value\relative paths
        input_path = os.path.join(os.path.dirname(__file__), input_path)
    with open(input_path, "r", encoding="utf-8-sig") as input_file:
        reader = csv.reader((line.replace("\0", "") for line in input_file))
        reader_list = list(reader)
    return reader_list


def extract_y_labels(input_path):
    """
    Extract only the Labels of the data

    Args:
        input_path (str): read csv file from this local file path

    Returns:
        :obj:`np.ndarray`: A numpy array of the labels, each item is the label of the row
    """
    reader_list = read_csv_file_data(input_path)
    Y_labels_vec = []
    cntr = 0
    for line in reader_list:
        Y_label = line[-1]
        Y_labels_vec.insert(cntr, Y_label)
        cntr += 1
    y_train = Y_labels_vec
    y_train = numpy.array(y_train, dtype=numpy.int32)
    return y_train


class NpSemanticSegData:
    """
    Dataset for NP Semantic Segmentation Model

        Args:
            file_path (str): read data from this local file path

            train_to_test_ratio (:obj:`float`): the train-to-test ration of the dataset

            feature_vec_dim (:obj:`int`): the size of the feature vector for each noun-phrase
    """

    def __init__(self, file_path, train_to_test_ratio=0.8, feature_vec_dim=603):
        self.file_path = file_path
        self.feature_vec_dim = feature_vec_dim
        self.train_to_test_ratio = train_to_test_ratio
        self.is_y_labels = None
        self.y_labels = None
        self.data_set = self.load_data_to_array_iterator()

    def load_data_from_file(self):
        """
        Loads data from file_path to X_train, y_train, X_test, y_test numpy arrays

        Returns:
            tuple(:obj:`np.ndarray`): X_train, y_train, X_test, y_test numpy arrays
        """
        reader_list = read_csv_file_data(self.file_path)
        # count num of feature vectors
        num_feats = len(reader_list)
        # is_y_labels is for inference - if the inference data is labeled y_labels are extracted
        self.is_y_labels = len(reader_list[0]) == self.feature_vec_dim + 1
        X_feature_matrix = numpy.zeros((num_feats, self.feature_vec_dim))
        Y_labels_vec = []
        cntr = 0
        for line in reader_list:
            X_features = numpy.array(line[: self.feature_vec_dim])
            X_feature_matrix[cntr, :] = X_features

            if self.is_y_labels:
                Y_label = line[self.feature_vec_dim]
                Y_labels_vec.insert(cntr, Y_label)
            cntr += 1

        len_train = int(math.floor(num_feats * self.train_to_test_ratio))

        X_train = X_feature_matrix[0 : len_train - 1]
        y_train = None
        if self.is_y_labels:
            y_train = Y_labels_vec[0 : len_train - 1]
            y_train = numpy.array(y_train, dtype=numpy.int32)

        X_test = X_feature_matrix[len_train:]
        y_test = None
        if self.is_y_labels:
            y_test = Y_labels_vec[len_train:]
            y_test = numpy.array(y_test, dtype=numpy.int32)
        return X_train, y_train, X_test, y_test

    def load_data_to_array_iterator(self):
        """
        Load data into dict of 'train' and 'test' datasets

        Returns:
            dict: dict with train set & test_set (each is dict with X and y)
        """
        X_train, y_train, X_test, y_test = self.load_data_from_file()
        data_set = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}}
        return data_set

    @property
    def train_set(self):
        """dict(:obj:`numpy.ndarray`): train set (X & y)"""
        return self.data_set["train"]

    @property
    def train_set_x(self):
        """dict(:obj:`numpy.ndarray`): train set (X)"""
        return self.data_set["train"]["X"]

    @property
    def train_set_y(self):
        """dict(:obj:`numpy.ndarray`): train set (y)"""
        return self.data_set["train"]["y"]

    @property
    def test_set(self):
        """dict(:obj:`numpy.ndarray`): test set (X & y)"""
        return self.data_set["test"]

    @property
    def test_set_x(self):
        """dict(:obj:`numpy.ndarray`): test set (X)"""
        return self.data_set["test"]["X"]

    @property
    def test_set_y(self):
        """dict(:obj:`numpy.ndarray`): test set (y)"""
        return self.data_set["test"]["y"]


def absolute_path(input_path):
    """
    Return input_path's absolute path

    Args:
        input_path(str): input_path

    Returns:
        str: absolute path
    """
    if isinstance(input_path, str):
        if not os.path.isabs(input_path):
            # handle case using default value\relative paths
            input_path = os.path.join(os.path.dirname(__file__), input_path)
    return input_path


if __name__ == "__main__":
    # parse the command line arguments
    parser = argparse.ArgumentParser(description="Prepare data")
    parser.add_argument(
        "--data",
        type=validate_existing_filepath,
        help="path the CSV file where the raw dataset is saved",
    )
    parser.add_argument(
        "--output",
        type=validate_parent_exists,
        help="path the CSV file where the prepared dataset will be saved",
    )
    parser.add_argument(
        "--w2v_path", type=validate_existing_filepath, help="path to the word embedding's model"
    )
    parser.add_argument(
        "--http_proxy", help="system's http proxy", type=validate_proxy_path, default=None
    )
    parser.add_argument(
        "--https_proxy", help="system's https proxy", type=validate_proxy_path, default=None
    )
    args = parser.parse_args()
    data_path = absolute_path(args.data)
    word2vec_path = args.w2v_path
    output_path = absolute_path(args.output)
    http_proxy = args.http_proxy
    https_proxy = args.https_proxy
    prepare_data(data_path, output_path, word2vec_path, http_proxy, https_proxy)