from indra.literature import *
from indra.sources.geneways.actionmention_parser import \
    GenewaysActionMentionParser
import random
import pickle
import re
from stemming.porter2 import stem
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from lxml import etree
from indra.resources.greek_alphabet import greek_alphabet


class FullTextMention(object):
    """Container for a Geneways action mention and its article full text.

    Parameters
    ----------
    mention : GenewaysActionMention
        The parsed action mention; must expose ``upstream``, ``downstream``
        and ``actiontype`` attributes.
    xml_full_text : str
        The article's full text, typically NLM XML (plain text is also
        tolerated by :meth:`extract_sentences`).
    """

    def __init__(self, mention, xml_full_text):
        self.mention = mention
        self.xml_full_text = xml_full_text

    def __repr__(self):
        return '%s %s %s' % (self.mention.upstream, self.mention.actiontype,
                             self.mention.downstream)

    def write_xml_to_file(self, output_file):
        """Write the stored full text to the given file path."""
        # Context manager guarantees the handle is closed even if the
        # write raises (the original leaked the handle on error).
        with open(output_file, 'w') as text_file:
            text_file.write(self.xml_full_text)

    def sentence_tokenize(self, text):
        """Split plain text into a list of sentences using NLTK."""
        return sent_tokenize(text)

    def get_sentences(self, root_element, block_tags):
        """Returns a list of plain-text sentences by iterating through
        XML tags except for those listed in block_tags."""
        sentences = []
        for element in root_element:
            # Skip entire subtrees whose tag matches a blocked tag.
            if not self.any_ends_with(block_tags, element.tag):
                # Only tokenize non-empty, non-whitespace text nodes.
                if element.text is not None and \
                        not re.match(r'^\s*$', element.text):
                    sentences.extend(self.sentence_tokenize(element.text))
                sentences.extend(self.get_sentences(element, block_tags))
        # NOTE(review): a leftover debug dump to 'sentence_debug.txt' was
        # removed here; it rewrote the file on every call.
        return sentences

    def any_ends_with(self, string_list, pattern):
        """Returns true iff pattern ends with one of the strings in
        string_list."""
        try:
            s_base = basestring  # Python 2
        except NameError:
            s_base = str  # Python 3
        if not isinstance(pattern, s_base):
            # Non-string tags (e.g. lxml comment/PI tag callables) never
            # match.
            return False
        return any(pattern.endswith(s) for s in string_list)

    def extract_sentences(self, block_tags, strip_tags, remove_tags):
        """Extract plain-text sentences from the full text.

        Parameters
        ----------
        block_tags : list of str
            Tags whose whole subtrees are skipped.
        strip_tags : list of str
            Tags removed while keeping their text content (e.g. 'italic').
        remove_tags : list of str
            Tags removed together with everything inside them.

        Returns
        -------
        list of str
            The extracted sentences.
        """
        s_text = self.xml_full_text
        # Remove these tags from the text, keeping their contents.
        for strip_tag in strip_tags:
            start_tag1 = '<' + strip_tag + '>'
            start_tag2 = '<' + strip_tag + ' [^<>]*>'
            end_tag = '</' + strip_tag + '>'
            s_text = re.sub(start_tag1, '', s_text)
            s_text = re.sub(start_tag2, '', s_text)
            s_text = re.sub(end_tag, '', s_text)
        # Remove these tags and anything in them from the text.
        for remove_tag in remove_tags:
            r = '<' + remove_tag + '[^>]*>' + '[^<]*</' + remove_tag + '>'
            s_text = re.sub(r, '', s_text)
        # Convert greek characters to their spelled-out names so they
        # survive tokenization and matching.
        for a in greek_alphabet:
            s_text = s_text.replace(a, greek_alphabet[a])
        # NOTE(review): a leftover debug dump to 'foo.txt' was removed here.
        try:
            root = etree.fromstring(s_text.encode('utf-8'))
            sentences = self.get_sentences(root, block_tags)
        except Exception:
            # If we failed to process XML, that probably means it's actually
            # plain text. (Narrowed from a bare except:, which also caught
            # KeyboardInterrupt/SystemExit.)
            sentences = self.sentence_tokenize(self.xml_full_text)
        return sentences

    def find_matching_sentences(self, block_tags=None, strip_tags=None,
                                remove_tags=None):
        """Return the sentences of the full text that match this mention.

        A sentence matches if it contains the mention's upstream and
        downstream participants and a verb sharing the action type's stem;
        hyphens are also tried as spaces.
        """
        if block_tags is None:
            block_tags = []
        if strip_tags is None:
            strip_tags = ['italic', 'bold', 'sup', 'sub', 'xref']
        if remove_tags is None:
            remove_tags = []
        sentences = self.extract_sentences(block_tags, strip_tags,
                                           remove_tags)
        matching_sentences = []
        for sentence in sentences:
            if self.sentence_matches(sentence) or \
                    self.sentence_matches(sentence.replace('-', ' ')):
                matching_sentences.append(sentence)
        return matching_sentences

    def get_tag_names(self):
        """Returns the set of tag names present in the XML."""
        root = etree.fromstring(self.xml_full_text.encode('utf-8'))
        return self.get_children_tag_names(root)

    def get_children_tag_names(self, xml_element):
        """Returns all tag names of xml element and its children."""
        tags = set()
        # NOTE(review): remove_namespace_from_tag is not defined in this
        # file -- presumably provided elsewhere on this class; confirm.
        tags.add(self.remove_namespace_from_tag(xml_element.tag))
        for element in xml_element.iter(tag=etree.Element):
            if element != xml_element:
                new_tags = self.get_children_tag_names(element)
                if new_tags is not None:
                    tags.update(new_tags)
        return tags

    def string_matches_sans_whitespace(self, str1, str2_fuzzy_whitespace):
        """Check whether str2 occurs in str1, modulo whitespace.

        str2 is treated as literal text; each run of whitespace in it
        matches zero or more whitespace characters in str1.
        """
        # Escape regex metacharacters in the query (gene names can contain
        # '(', '+', etc.), then join the whitespace-separated pieces with
        # \s*. The original passed '\s*' as a re.sub *replacement*, which
        # is an invalid escape and raises re.error on Python 3.7+.
        pattern = r'\s*'.join(
            re.escape(token)
            for token in re.split(r'\s+', str2_fuzzy_whitespace))
        return re.search(pattern, str1) is not None

    def sentence_matches(self, sentence_text):
        """Returns true iff the sentence contains this mention's upstream
        and downstream participants, and if one of the stemmed verbs in
        the sentence is the same as the stemmed action type."""
        # Get the first word of the action type and assume this is the verb
        # (Ex. get depends for depends on)
        actiontype_words = word_tokenize(self.mention.actiontype)
        if not actiontype_words:
            # Empty action type can never match (guards IndexError below).
            return False
        actiontype_verb_stemmed = stem(actiontype_words[0])

        has_upstream = self.string_matches_sans_whitespace(
            sentence_text.lower(), self.mention.upstream.lower())
        has_downstream = self.string_matches_sans_whitespace(
            sentence_text.lower(), self.mention.downstream.lower())
        has_verb = any(actiontype_verb_stemmed == stem(word)
                       for word in word_tokenize(sentence_text))
        return has_upstream and has_downstream and has_verb