python source code of _unigram

abydos-master
- abydos
  - tokenizer
    - _vc_cluster.py
    - _wordpunct.py
    - _legalipy.py
    - _tokenizer.py
    - _c_or_v_cluster.py
    - _regexp.py
    - _whitespace.py
    - _cv_cluster.py
    - _character.py
    - _saps.py
    - __init__.py
    - _sonoripy.py
    - _q_grams.py
    - _q_skipgrams.py
    - _nltk.py
  - fingerprint
    - _bwtrlef.py
    - _string.py
    - _consonant.py
    - _count.py
    - _lacss.py
    - _omission_key.py
    - _skeleton_key.py
    - _fingerprint.py
    - _occurrence_halved.py
    - _lc_cutter.py
    - _synoname_toolcode.py
    - _extract.py
    - _phonetic.py
    - _qgram.py
    - _occurrence.py
    - __init__.py
    - _extract_position_frequency.py
    - _position.py
    - _bwtf.py
  - phones
    - _phones.py
    - __init__.py
  - corpus
    - _corpus.py
    - _ngram_corpus.py
    - _unigram_corpus.py
    - __init__.py
  - util
    - _data.py
    - __init__.py
    - _ncr.py
    - _prod.py
  - compression
    - _bwt.py
    - _rle.py
    - __init__.py
    - _arithmetic.py
  - __init__.py
  - stemmer
    - _clef_german.py
    - _clef_swedish.py
    - _porter2.py
    - _snowball_german.py
    - _s_stemmer.py
    - _stemmer.py
    - _schinke.py
    - _caumanns.py
    - _clef_german_plus.py
    - _snowball_dutch.py
    - _snowball.py
    - _snowball_swedish.py
    - _lovins.py
    - __init__.py
    - _uea_lite.py
    - _porter.py
    - _paice_husk.py
    - _snowball_danish.py
    - _snowball_norwegian.py
  - stats
    - _pairwise.py
    - _confusion_table.py
    - __init__.py
    - _mean.py
  - phonetic
    - _eudex.py
    - _phonic.py
    - _fuzzy_soundex.py
    - _soundex.py
    - _meta_soundex.py
    - _double_metaphone.py
    - _haase.py
    - _pshp_soundex_first.py
    - _caverphone.py
    - _waahlin.py
    - _phonet.py
    - _sound_d.py
    - _henry_early.py
    - _pshp_soundex_last.py
    - _nrl.py
    - _onca.py
    - _dolby.py
    - _lein.py
    - _beider_morse.py
    - _parmar_kumbharana.py
    - _soundex_br.py
    - _russell_index.py
    - _refined_soundex.py
    - _phonem.py
    - _norphone.py
    - _phonex.py
    - _sfinx_bis.py
    - _phonix.py
    - _roger_root.py
    - _phonetic.py
    - _metaphone.py
    - _alpha_sis.py
    - _daitch_mokotoff.py
    - _phonetic_spanish.py
    - __init__.py
    - _mra.py
    - _ainsworth.py
    - _reth_schek.py
    - _beider_morse_data.py
    - _fonem.py
    - _nysiis.py
    - _koelner.py
    - _spanish_metaphone.py
    - _davidson.py
    - _spfc.py
    - _statistics_canada.py
  - distance
    - _eudex.py
    - _baulieu_viii.py
    - _suffix.py
    - _upholt.py
    - _millar.py
    - _goodall.py
    - _unknown_f.py
    - _kuhns_v.py
    - _forbes_ii.py
    - _fidelity.py
    - _editex.py
    - _levenshtein.py
    - _kuhns_ix.py
    - _unknown_i.py
    - _kuhns_ii.py
    - _russell_rao.py
    - _matusita.py
    - _gotoh.py
    - _braun_blanquet.py
    - _softtf_idf.py
    - _morisita.py
    - _baulieu_x.py
    - _aline.py
    - _mutual_information.py
    - _baulieu_iii.py
    - _kuhns_x.py
    - _sift4.py
    - _soft_cosine.py
    - _unknown_m.py
    - _digby.py
    - _typo.py
    - _kuhns_xii.py
    - _ncd_lzma.py
    - _scott_pi.py
    - _unknown_b.py
    - _goodman_kruskal_lambda.py
    - _chebyshev.py
    - _chao_jaccard.py
    - _ncd_arith.py
    - _baulieu_vii.py
    - _harris_lahey.py
    - _ample.py
    - _rees_levenshtein.py
    - _phonetic_distance.py
    - _rouge_w.py
    - _bag.py
    - _sift4_simplest.py
    - _doolittle.py
    - _average_linkage.py
    - _flexmetric.py
    - _ncd_bwtrle.py
    - _kulczynski_ii.py
    - _phonetic_edit_distance.py
    - _tversky.py
    - _johnson.py
    - _koppen_ii.py
    - _fuzzywuzzy_token_sort.py
    - _shape.py
    - _lcsseq.py
    - _bisim.py
    - _pearson_phi.py
    - _mountford.py
    - _anderberg.py
    - _damerau_levenshtein.py
    - _unknown_l.py
    - _yule_y.py
    - _unknown_c.py
    - _length.py
    - _pearson_chi_squared.py
    - _hamann.py
    - _koppen_i.py
    - _faith.py
    - _quantitative_cosine.py
    - _bleu.py
    - _ncd_lzss.py
    - _meta_levenshtein.py
    - _positional_q_gram_jaccard.py
    - _unigram_subtuple.py
    - _sokal_michener.py
    - _rogot_goldberg.py
    - _dice_asymmetric_ii.py
    - _warrens_iv.py
    - _baulieu_xiv.py
    - _hamming.py
    - _kuhns_vii.py
    - _baulieu_xii.py
    - _gini_ii.py
    - _baulieu_xv.py
    - _kulczynski_i.py
    - _sokal_sneath_iv.py
    - _baulieu_v.py
    - _guttman_lambda_a.py
    - _kent_foster_i.py
    - _tulloss_u.py
    - _maarel.py
    - _benini_i.py
    - _steffensen.py
    - _cole.py
    - _smith_waterman.py
    - _mcewen_michael.py
    - _pearson_heron_ii.py
    - _covington.py
    - _kuhns_xi.py
    - _hassanat.py
    - _discounted_levenshtein.py
    - _fellegi_sunter.py
    - _forbes_i.py
    - _consonni_todeschini_ii.py
    - _unknown_e.py
    - _raup_crick.py
    - _synoname.py
    - _mcconnaughey.py
    - _yule_q.py
    - _brainerd_robinson.py
    - _gilbert_wells.py
    - _baulieu_xi.py
    - _consonni_todeschini_i.py
    - _bennet.py
    - _generalized_fleiss.py
    - _hellinger.py
    - _overlap.py
    - _ms_contingency.py
    - _baulieu_iv.py
    - _ncd_bz2.py
    - _horn_morisita.py
    - _tulloss_t.py
    - _warrens_i.py
    - _goodman_kruskal_tau_a.py
    - _relaxed_hamming.py
    - _tichy.py
    - _distance.py
    - _fossum.py
    - _consonni_todeschini_iii.py
    - _yule_q_ii.py
    - _kuhns_viii.py
    - _marking.py
    - _baroni_urbani_buser_i.py
    - _hurlbert.py
    - _gower_legendre.py
    - _canberra.py
    - _tulloss_r.py
    - _kent_foster_ii.py
    - _isg.py
    - _quantitative_dice.py
    - _cosine.py
    - _indel.py
    - _azzoo.py
    - _baroni_urbani_buser_ii.py
    - _vps.py
    - _warrens_ii.py
    - _unknown_d.py
    - _dispersion.py
    - _lig3.py
    - _size.py
    - _pearson_ii.py
    - _cohen_kappa.py
    - _dunning.py
    - _cao.py
    - _fuzzywuzzy_token_set.py
    - _ncd_rle.py
    - _maxwell_pilliner.py
    - _lcsuffix.py
    - _warrens_iii.py
    - _masi.py
    - _eyraud.py
    - _sorgenfrei.py
    - _sokal_sneath_ii.py
    - _marking_metric.py
    - _q_gram.py
    - _peirce.py
    - _rouge_s.py
    - _minhash.py
    - _ssk.py
    - _consonni_todeschini_iv.py
    - _sokal_sneath_iii.py
    - _monge_elkan.py
    - _baulieu_vi.py
    - _ncd_zlib.py
    - _yates_chi_squared.py
    - _positional_q_gram_overlap.py
    - _ident.py
    - _tarantula.py
    - _goodman_kruskal_tau_b.py
    - _lcprefix.py
    - _prefix.py
    - _positional_q_gram_dice.py
    - _dice_asymmetric_i.py
    - _ozbay.py
    - _iterative_substring.py
    - _dice.py
    - _cormode_lz.py
    - _consonni_todeschini_v.py
    - _jaccard_nm.py
    - _andres_marzo_delta.py
    - _complete_linkage.py
    - _unknown_j.py
    - _chord.py
    - _rouge_su.py
    - _unknown_k.py
    - _rouge_l.py
    - _saps.py
    - __init__.py
    - _mra.py
    - _unknown_g.py
    - _gwet_ac.py
    - _kuder_richardson.py
    - _sift4_extended.py
    - _strcmp95.py
    - _kuhns_i.py
    - _token_distance.py
    - _stiles.py
    - _needleman_wunsch.py
    - _fuzzywuzzy_partial_string.py
    - _single_linkage.py
    - _tarwid.py
    - _shapira_storer_i.py
    - _gini_i.py
    - _sokal_sneath_i.py
    - _baulieu_i.py
    - _fleiss.py
    - _yujian_bo.py
    - _michelet.py
    - _fager_mcgowan.py
    - _henderson_heron.py
    - _rogers_tanimoto.py
    - _tulloss_s.py
    - _dennis.py
    - _sokal_sneath_v.py
    - _guth.py
    - _kendall_tau.py
    - _batagelj_bren.py
    - _inclusion.py
    - _hawkins_dotson.py
    - _jaro_winkler.py
    - _whittaker.py
    - _fleiss_levin_paik.py
    - _lorentzian.py
    - _pattern.py
    - _mlipns.py
    - _guttman_lambda_b.py
    - _quantitative_jaccard.py
    - _warrens_v.py
    - _jaccard.py
    - _kuhns_vi.py
    - _bhattacharyya.py
    - _yjhhr.py
    - _baulieu_xiii.py
    - _clark.py
    - _goodman_kruskal_lambda_r.py
    - _pearson_iii.py
    - _minkowski.py
    - _higuera_mico.py
    - _stuart_tau.py
    - _jensen_shannon.py
    - _baulieu_ix.py
    - _chao_dice.py
    - _lcsstr.py
    - _gilbert.py
    - _roberts.py
    - _manhattan.py
    - _block_levenshtein.py
    - _kuhns_iv.py
    - _ratcliff_obershelp.py
    - _baystat.py
    - _clement.py
    - _baulieu_ii.py
    - _euclidean.py
    - _tf_idf.py
    - _tetrachoric.py
    - _kuhns_iii.py
    - _ncd_paq9a.py
    - _weighted_jaccard.py
    - _benini_ii.py
    - _unknown_h.py
    - _unknown_a.py
- .codeclimate.yml
- .github
  - CODEOWNERS
- .circleci
  - config.yml
- .coveragerc
- requirements-test.txt
- .gitmodules
- badge_update.py
- helpers
  - call_and_write_log.py
  - bm_php2py.py
- LICENSE
- CODE_OF_CONDUCT.rst
- Pipfile
- azure-pipelines.yml
- abydos.xcf
- CODING_STANDARDS.rst
- README.rst
- FAQ.rst
- setup.py
- requirements-dev.txt
- data
  - features
    - features_terms.csv
    - features_csv_to_dict.py
    - features_symbols.csv
- VERSION.rst
- AUTHORS.rst
- stubs
  - syllabipy
    - sonoripy.pyi
    - __init__.pyi
    - legalipy.pyi
  - lzss
    - __init__.pyi
  - paq
    - __init__.pyi
  - numpy
    - __init__.pyi
    - core
      - numeric.pyi
      - __init__.pyi
      - numerictypes.pyi
      - _internal.pyi
- setup.cfg
- HISTORY.rst
- .travis.yml
- .pypirc
- tests
  - tokenizer
    - test_tokenizer_character.py
    - test_tokenizer_q_grams.py
    - test_tokenizer_legalipy.py
    - test_tokenizer__tokenizer.py
    - test_tokenizer_vc_cluster.py
    - test_tokenizer_c_or_v_cluster.py
    - test_tokenizer_nltk.py
    - test_tokenizer_whitespace.py
    - test_tokenizer_regexp.py
    - test_tokenizer_q_skipgrams.py
    - __init__.py
    - test_tokenizer_saps.py
    - test_tokenizer_wordpunct.py
    - test_tokenizer_cv_cluster.py
    - test_tokenizer_sonoripy.py
  - fingerprint
    - test_fingerprint_lacss.py
    - test_fingerprint_phonetic.py
    - test_fingerprint_skeleton_key.py
    - test_fingerprint_extract.py
    - test_fingerprint_consonant.py
    - test_fingerprint_synoname_toolcode.py
    - test_fingerprint_bwtrlef.py
    - test_fingerprint_bwtf.py
    - test_fingerprint_position.py
    - test_fingerprint_count.py
    - test_fingerprint_omission_key.py
    - __init__.py
    - test_fingerprint__fingerprint.py
    - test_fingerprint_occurrence_halved.py
    - test_fingerprint_lc_cutter.py
    - test_fingerprint_extract_position_frequency.py
    - test_fingerprint_occurrence.py
    - test_fingerprint_string.py
    - test_fingerprint_qgram.py
  - corpora
    - uscensus2000.bm.cc.csv
    - simple-ngrams.txt
    - simple-ngrams-pos.txt
    - uea-lite_wsj.csv
    - variantNames.csv
    - fake_words.csv
    - homophones.csv
    - wikipediaCommonMisspellings.csv
    - php_caverphone.csv
    - misspellings.csv
    - nachnamen.bm.cc.csv
  - phones
    - test_phones.py
    - __init__.py
  - corpus
    - test_corpus_corpus.py
    - test_corpus_n_gram_corpus.py
    - __init__.py
    - test_corpus_unigram_corpus.py
  - util
    - test_ncr.py
    - test_data.py
    - test_prod.py
    - __init__.py
  - fuzz
    - corpora
      - blns.txt
      - basewords.txt
    - fuzz_test_tokenizer.py
    - fuzz_test_stemmer.py
    - fuzz_test_fingerprint.py
    - fuzz_test_distance.py
    - fuzz_test_phonetic.py
    - __init__.py
  - compression
    - test_compression_bwt.py
    - test_compression_rle.py
    - __init__.py
    - test_compression_arithmetic.py
  - regression
  - __init__.py
  - stemmer
    - test_stemmer_lovins.py
    - test_stemmer_snowball_german.py
    - test_stemmer_porter.py
    - test_stemmer_schinke.py
    - test_stemmer_paice_husk.py
    - test_stemmer_snowball_norwegian.py
    - test_stemmer__snowball.py
    - test_stemmer_snowball_dutch.py
    - test_stemmer_clef_german_plus.py
    - test_stemmer_snowball_swedish.py
    - test_stemmer_snowball_danish.py
    - test_stemmer_porter2.py
    - test_stemmer_caumanns.py
    - test_stemmer__stemmer.py
    - __init__.py
    - test_stemmer_clef_swedish.py
    - test_stemmer_s_stemmer.py
    - test_stemmer_clef_german.py
    - test_stemmer_uealite.py
  - stats
    - test_stats_confusion_table.py
    - test_stats_mean.py
    - __init__.py
    - test_stats_pairwise.py
  - phonetic
    - test_phonetic_pshp_soundex_first.py
    - test_phonetic_refined_soundex.py
    - test_phonetic_sound_d.py
    - test_phonetic_lein.py
    - test_phonetic_sfinxbis.py
    - test_phonetic_phonem.py
    - test_phonetic_alpha_sis.py
    - test_phonetic_ainsworth.py
    - test_phonetic_waahlin.py
    - test_phonetic_reth_schek.py
    - test_phonetic_meta_soundex.py
    - test_phonetic_koelner.py
    - test_phonetic_fuzzy_soundex.py
    - test_phonetic_parmar_kumbharana.py
    - test_phonetic_beider_morse.py
    - test_phonetic_spanish_metaphone.py
    - test_phonetic_norphone.py
    - test_phonetic_phonix.py
    - test_phonetic_dolby.py
    - test_phonetic_metaphone.py
    - test_phonetic_haase.py
    - test_phonetic__phonetic.py
    - test_phonetic_soundex_br.py
    - test_phonetic_henry_early.py
    - test_phonetic_spfc.py
    - test_phonetic_nysiis.py
    - test_phonetic_davidson.py
    - test_phonetic_double_metaphone.py
    - test_phonetic_phonic.py
    - test_phonetic_fonem.py
    - test_phonetic_nrl.py
    - __init__.py
    - test_phonetic_phonet.py
    - test_phonetic_statistics_canada.py
    - test_phonetic_phonex.py
    - test_phonetic_phonetic_spanish.py
    - test_phonetic_roger_root.py
    - test_phonetic_onca.py
    - test_phonetic_pshp_soundex_last.py
    - test_phonetic_mra.py
    - test_phonetic_russell_index.py
    - test_phonetic_daitch_mokotoff.py
    - test_phonetic_eudex.py
    - test_phonetic_soundex.py
    - test_phonetic_caverphone.py
  - distance
    - test_distance_goodman_kruskal_tau_b.py
    - test_distance_maxwell_pilliner.py
    - test_distance_pearson_ii.py
    - test_distance_typo.py
    - test_distance_ncd_zlib.py
    - test_distance_ncd_lzss.py
    - test_distance_unknown_c.py
    - test_distance_fuzzywuzzy_token_set.py
    - test_distance_tarantula.py
    - test_distance_sift4.py
    - test_distance_sift4_extended.py
    - test_distance_vps.py
    - test_distance_gotoh.py
    - test_distance_maarel.py
    - test_distance_fellegi_sunter.py
    - test_distance_ident.py
    - test_distance__distance.py
    - test_distance_braun_blanquet.py
    - test_distance_quantitative_jaccard.py
    - test_distance_tf_idf.py
    - test_distance_roberts.py
    - test_distance_lcsseq.py
    - test_distance_sokal_michener.py
    - test_distance_synoname.py
    - test_distance_tulloss_t.py
    - test_distance_baulieu_iii.py
    - test_distance_baystat.py
    - test_distance_unknown_b.py
    - test_distance_jaro_winkler.py
    - test_distance_ncd_paq9a.py
    - test_distance_sift4_simplest.py
    - test_distance_positional_q_gram_dice.py
    - test_distance_bisim.py
    - test_distance_kulczynski_i.py
    - test_distance_yule_q_ii.py
    - test_distance_tichy.py
    - test_distance_kuhns_vii.py
    - test_distance_warrens_iv.py
    - test_distance_goodman_kruskal_lambda_r.py
    - test_distance_complete_linkage.py
    - test_distance_dispersion.py
    - test_distance_batagelj_bren.py
    - test_distance_mountford.py
    - test_distance_pattern.py
    - test_distance_mutual_information.py
    - test_distance_unknown_d.py
    - test_distance_marking_metric.py
    - test_distance_gini_ii.py
    - test_distance_jaccard_nm.py
    - test_distance_indel.py
    - test_distance_rouge_w.py
    - test_distance_jensen_shannon.py
    - test_distance_positional_q_gram_jaccard.py
    - test_distance_kuhns_i.py
    - test_distance_minkowski.py
    - test_distance_cohen_kappa.py
    - test_distance_yjhhr.py
    - test_distance_lorentzian.py
    - test_distance_gower_legendre.py
    - test_distance_sokal_sneath_ii.py
    - test_distance_johnson.py
    - test_distance_baulieu_xi.py
    - test_distance_raup_crick.py
    - test_distance_lcprefix.py
    - test_distance_mcconnaughey.py
    - test_distance_pearson_phi.py
    - test_distance_mlipns.py
    - test_distance_warrens_v.py
    - test_distance_bleu.py
    - test_distance_unknown_i.py
    - test_distance_weighted_jaccard.py
    - test_distance_kendall_tau.py
    - test_distance_covington.py
    - test_distance_editex.py
    - test_distance_flexmetric.py
    - test_distance_chebyshev.py
    - test_distance_mra.py
    - test_distance_baulieu_ii.py
    - test_distance_tulloss_s.py
    - test_distance_ncd_arith.py
    - test_distance_guttman_lambda_b.py
    - test_distance_rouge_su.py
    - test_distance_kuhns_xii.py
    - test_distance_rouge_l.py
    - test_distance_baulieu_iv.py
    - test_distance_kuhns_x.py
    - test_distance_sorgenfrei.py
    - test_distance_fuzzywuzzy_partial_string.py
    - test_distance_baulieu_i.py
    - test_distance_fossum.py
    - test_distance_masi.py
    - test_distance_warrens_iii.py
    - test_distance_steffensen.py
    - test_distance_unknown_a.py
    - test_distance_ample.py
    - test_distance_minhash.py
    - test_distance_faith.py
    - test_distance_russell_rao.py
    - test_distance_sokal_sneath_i.py
    - test_distance_baulieu_xiii.py
    - test_distance_rouge_s.py
    - test_distance_marking.py
    - test_distance_matusita.py
    - test_distance_kuhns_xi.py
    - test_distance_baulieu_vi.py
    - test_distance_yujian_bo.py
    - test_distance_azzoo.py
    - test_distance_softtf_idf.py
    - test_distance_kuhns_iv.py
    - test_distance_hamann.py
    - test_distance_chord.py
    - test_distance_consonni_todeschini_iv.py
    - test_distance_chao_dice.py
    - test_distance_doolittle.py
    - test_distance_bag.py
    - test_distance_yule_y.py
    - test_distance_overlap.py
    - test_distance_dice_asymmetric_i.py
    - test_distance_goodall.py
    - test_distance_yates_chi_squared.py
    - test_distance_stiles.py
    - test_distance_unknown_h.py
    - test_distance_unknown_f.py
    - test_distance_monge_elkan.py
    - test_distance_rees_levenshtein.py
    - test_distance_single_linkage.py
    - test_distance_kuhns_viii.py
    - test_distance_positional_q_gram_overlap.py
    - test_distance_unknown_k.py
    - test_distance_kuder_richardson.py
    - test_distance_fager_mcgowan.py
    - test_distance_quantitative_cosine.py
    - test_distance_tulloss_u.py
    - test_distance_smith_waterman.py
    - test_distance_dennis.py
    - test_distance_digby.py
    - test_distance_baulieu_vii.py
    - test_distance_kuhns_iii.py
    - test_distance_tversky.py
    - test_distance_bennet.py
    - test_distance_cormode_lz.py
    - test_distance_guth.py
    - test_distance_rogers_tanimoto.py
    - test_distance_strcmp95.py
    - test_distance_gini_i.py
    - test_distance_sokal_sneath_iii.py
    - test_distance_baroni_urbani_buser_ii.py
    - test_distance_consonni_todeschini_iii.py
    - test_distance_baulieu_ix.py
    - test_distance_clark.py
    - test_distance_lig3.py
    - test_distance_sokal_sneath_iv.py
    - test_distance_higuera_mico.py
    - test_distance_yule_q.py
    - test_distance_euclidean.py
    - test_distance_shapira_storer_i.py
    - test_distance_kuhns_ix.py
    - test_distance_gwet_ac.py
    - test_distance_michelet.py
    - test_distance_horn_morisita.py
    - test_distance__token_distance.py
    - test_distance_average_linkage.py
    - test_distance_cosine.py
    - test_distance_size.py
    - test_distance_aline.py
    - test_distance_ozbay.py
    - test_distance_length.py
    - test_distance_needleman_wunsch.py
    - test_distance_tetrachoric.py
    - test_distance_hellinger.py
    - test_distance_gilbert_wells.py
    - test_distance_dice.py
    - test_distance_inclusion.py
    - test_distance_whittaker.py
    - test_distance_unknown_m.py
    - test_distance_brainerd_robinson.py
    - test_distance_quantitative_dice.py
    - test_distance_morisita.py
    - test_distance_clement.py
    - test_distance_ssk.py
    - test_distance_guttman_lambda_a.py
    - test_distance_rogot_goldberg.py
    - test_distance_ncd_bwtrle.py
    - test_distance_dunning.py
    - test_distance_kuhns_v.py
    - test_distance_baulieu_x.py
    - test_distance_forbes_i.py
    - test_distance_jaccard.py
    - test_distance_hawkins_dotson.py
    - test_distance_hassanat.py
    - test_distance_goodman_kruskal_tau_a.py
    - test_distance_levenshtein.py
    - test_distance_relaxed_hamming.py
    - __init__.py
    - test_distance_consonni_todeschini_ii.py
    - test_distance_pearson_iii.py
    - test_distance_ratcliff_obershelp.py
    - test_distance_suffix.py
    - test_distance_baulieu_xiv.py
    - test_distance_generalized_fleiss.py
    - test_distance_upholt.py
    - test_distance_sokal_sneath_v.py
    - test_distance_peirce.py
    - test_distance_tulloss_r.py
    - test_distance_fleiss_levin_paik.py
    - test_distance_iterative_substring.py
    - test_distance_eudex.py
    - test_distance_kent_foster_i.py
    - test_distance_fidelity.py
    - test_distance_anderberg.py
    - test_distance_kent_foster_ii.py
    - test_distance_stuart_tau.py
    - test_distance_kuhns_vi.py
    - test_distance_ms_contingency.py
    - test_distance_forbes_ii.py
    - test_distance_lcsstr.py
    - test_distance_unknown_l.py
    - test_distance_discounted_levenshtein.py
    - test_distance_consonni_todeschini_i.py
    - test_distance_tarwid.py
    - test_distance_dice_asymmetric_ii.py
    - test_distance_gilbert.py
    - test_distance_cao.py
    - test_distance_warrens_ii.py
    - test_distance_unknown_e.py
    - test_distance_mcewen_michael.py
    - test_distance_benini_ii.py
    - test_distance_soft_cosine.py
    - test_distance_harris_lahey.py
    - test_distance_kulczynski_ii.py
    - test_distance_meta_levenshtein.py
    - test_distance_lcsuffix.py
    - test_distance_shape.py
    - test_distance_ncd_lzma.py
    - test_distance_cole.py
    - test_distance_bhattacharyya.py
    - test_distance_hurlbert.py
    - test_distance_koppen_ii.py
    - test_distance_consonni_todeschini_v.py
    - test_distance_fuzzywuzzy_token_sort.py
    - test_distance_canberra.py
    - test_distance_henderson_heron.py
    - test_distance_block_levenshtein.py
    - test_distance_unknown_j.py
    - test_distance_kuhns_ii.py
    - test_distance_andres_marzo_delta.py
    - test_distance_goodman_kruskal_lambda.py
    - test_distance_benini_i.py
    - test_distance_unknown_g.py
    - test_distance_baulieu_xv.py
    - test_distance_isg.py
    - test_distance_ncd_rle.py
    - test_distance_chao_jaccard.py
    - test_distance_scott_pi.py
    - test_distance_baulieu_viii.py
    - test_distance_damerau_levenshtein.py
    - test_distance_phonetic_distance.py
    - test_distance_saps.py
    - test_distance_q_gram.py
    - test_distance_pearson_heron_ii.py
    - test_distance_eyraud.py
    - test_distance_phonetic_edit_distance.py
    - test_distance_manhattan.py
    - test_distance_warrens_i.py
    - test_distance_baulieu_v.py
    - test_distance_prefix.py
    - test_distance_koppen_i.py
    - test_distance_unigram_subtuple.py
    - test_distance_hamming.py
    - test_distance_fleiss.py
    - test_distance_millar.py
    - test_distance_ncd_bz2.py
    - test_distance_baulieu_xii.py
    - test_distance_baroni_urbani_buser_i.py
    - test_distance_pearson_chi_squared.py
- pyproject.toml
- requirements.txt
- .gitignore
- docs
  - abydos.bib
  - history.rst
  - abydos.stemmer.rst
  - abydos.compression.rst
  - Makefile
  - make.bat
  - abydos.fingerprint.rst
  - faq.rst
  - intro.rst
  - abydos.distance.rst
  - abydos.rst
  - abydos.util.rst
  - abydos.stats.rst
  - abydos.phonetic.rst
  - _build
    - .gitignore
  - modules.rst
  - abydos.corpus.rst
  - requirements.txt
  - index.rst
  - conf.py
  - abydos.phones.rst
  - abydos.tokenizer.rst
  - _templates
    - .gitignore
  - _static
    - .gitignore
- binder
  - Basic Examples.ipynb
  - Reversed Metaphone using Keras seq2seq.ipynb
  - requirements.txt
  - Text Classification of Drug Reviews.ipynb
- .project
- MANIFEST.in
- tox.ini
- .pyup.yml

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.corpus._unigram_corpus.

Unigram Corpus
"""

import pickle  # noqa: S403
from codecs import open as c_open
from collections import Counter, defaultdict
from math import log1p
from typing import Any, Callable, DefaultDict, Optional, Tuple

from ..tokenizer import _Tokenizer

# Only the corpus class is exported by ``from ... import *``; the
# underscore-prefixed helper below stays private to this module.
__all__ = ['UnigramCorpus']


def _dd_default(*args: Any) -> Tuple[int, int]:
    """Return the zeroed ``(term count, document count)`` pair.

    This is the default factory of the corpus defaultdict. It is a named
    module-level function rather than a lambda because the defaultdict is
    pickled when the corpus is saved, and pickle stores the factory by
    reference to its importable name.

    Parameters
    ----------
    *args : Any
        Ignored; accepted so the function tolerates any positional call.

    Returns
    -------
    tuple
        Always ``(0, 0)``.

    """
    return (0, 0)


class UnigramCorpus:
    """Unigram corpus class.

    Largely intended for calculating inverse document frequency (IDF) from a
    large corpus of unigram (or smaller) tokens, this class encapsulates a
    dict object. Each key is a unigram token whose value is a tuple consisting
    of the number of times a term appeared and the number of distinct documents
    in which it appeared.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        corpus_text: str = '',
        documents: int = 0,
        word_transform: Optional[Callable[[str], str]] = None,
        word_tokenizer: 'Optional[_Tokenizer]' = None,
    ) -> None:
        r"""Initialize UnigramCorpus.

        Parameters
        ----------
        corpus_text : str
            The corpus text as a single string. It is added to the corpus as
            one document, so the document count is incremented by one even
            when the text is empty.
        documents : int
            The initial number of documents in the corpus. Whenever counts
            are imported or loaded, the document count is raised to the
            largest per-term distinct-documents count, if that is greater.
        word_transform : function
            A function to apply to each term before term tokenization and
            addition to the corpus. One might use this, for example, to apply
            Soundex encoding to each term.
        word_tokenizer : _Tokenizer
            A tokenizer applied to each whitespace-delimited term in order to
            split it into the tokens that are actually counted. If None, each
            term is counted as a single token.

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = UnigramCorpus(tqbf)


        .. versionadded:: 0.4.0

        """
        self.corpus = defaultdict(
            _dd_default
        )  # type: DefaultDict[str, Tuple[int, int]]
        self.transform = word_transform
        self.tokenizer = word_tokenizer
        self.doc_count = documents

        self.add_document(corpus_text)

    def add_document(self, doc: str) -> None:
        """Add a new document to the corpus.

        The document is split on whitespace; every distinct term is passed
        on with its number of occurrences and a document count of one, and
        the corpus-wide document count is then incremented by one.

        Parameters
        ----------
        doc : str
            A string, representing the document to be added.


        .. versionadded:: 0.4.0

        """
        for word, count in Counter(doc.split()).items():
            self._add_word(word, count, 1)
        self.doc_count += 1

    def save_corpus(self, filename: str) -> None:
        """Save the corpus to a file.

        This employs pickle to save the corpus (a defaultdict). Only the
        term dictionary is written; other parameters of the corpus, such as
        its word_tokenizer, are not saved.

        Parameters
        ----------
        filename : str
            The filename to save the corpus to.


        .. versionadded:: 0.4.0

        """
        with open(filename, mode='wb') as pkl:
            pickle.dump(self.corpus, pkl)

    def load_corpus(self, filename: str) -> None:
        """Load the corpus from a file.

        This employs pickle to load the corpus (a defaultdict). Other
        parameters of the corpus, such as its word_tokenizer, will not be
        affected and should be set during initialization. After loading, the
        document count is raised to the largest per-term document count in
        the loaded data, if that is greater than the current value.

        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.

        Parameters
        ----------
        filename : str
            The filename to load the corpus from.


        .. versionadded:: 0.4.0

        """
        with open(filename, mode='rb') as pkl:
            # SECURITY: pickle is unsafe on untrusted input; see docstring.
            self.corpus = pickle.load(pkl)  # noqa: S301
        self._update_doc_count()

    def _update_doc_count(self) -> None:
        """Update document count, if necessary.

        The document count is never lowered: it becomes the larger of its
        current value and the greatest per-term document count. An empty
        corpus leaves it unchanged.

        .. versionadded:: 0.4.0
        """
        # default=0 keeps an empty corpus (e.g. an empty import file or an
        # empty pickled dict) from raising ValueError in max().
        max_docs = max(
            (entry[1] for entry in self.corpus.values()), default=0
        )
        self.doc_count = max(max_docs, self.doc_count)

    def _add_word(self, word: str, count: int, doc_count: int) -> None:
        """Add a term to the corpus, possibly after tokenization.

        The transform (if any) is applied first. If a tokenizer is set, the
        transformed term is tokenized and each resulting token is credited
        with its in-term frequency times ``count``; otherwise the term
        itself is credited with ``count``. Every affected token also has
        ``doc_count`` added to its document count.

        Parameters
        ----------
        word : str
            Word to add to the corpus
        count : int
            Count of word appearances
        doc_count : int
            Count of distinct documents in which word appears


        .. versionadded:: 0.4.0

        """
        if self.transform is not None:
            word = self.transform(word)

        if self.tokenizer is not None:
            self.tokenizer.tokenize(word)
            token_counts = self.tokenizer.get_counter()
        else:
            # Without a tokenizer the (transformed) word is the only token.
            token_counts = Counter({word: 1})

        for tok, occurrences in token_counts.items():
            prior_count, prior_doc_count = self.corpus[tok]
            self.corpus[tok] = (
                prior_count + occurrences * count,
                prior_doc_count + doc_count,
            )

    def gng_importer(self, corpus_file: str) -> None:
        """Fill in self.corpus from a Google NGram corpus file.

        Every non-blank line must consist of four tab-separated fields: the
        term, a field that is ignored, the term count, and the document
        count. Anything from the first underscore of the term onwards is
        discarded (presumably to strip part-of-speech tags such as
        ``_NOUN``). Blank lines are skipped.

        Parameters
        ----------
        corpus_file : str
            The path of the Google NGram file from which to initialize the
            n-gram corpus. It is read as UTF-8.

        Raises
        ------
        ValueError
            If a non-blank line does not have exactly four fields or its
            counts are not integers.


        .. versionadded:: 0.4.0

        """
        with c_open(corpus_file, 'r', encoding='utf-8') as gng:
            for line in gng:
                record = line.rstrip()
                if not record:
                    # Tolerate blank lines, e.g. a trailing empty line.
                    continue
                word, _, count, doc_count = record.split('\t')
                # partition() returns the whole word when no '_' is present.
                word = word.partition('_')[0]

                self._add_word(word, int(count), int(doc_count))
        self._update_doc_count()

    def idf(self, term: str) -> float:
        r"""Calculate the Inverse Document Frequency of a term in the corpus.

        The value is ``log(1 + N / n)``, where ``N`` is the corpus document
        count and ``n`` is the number of documents recorded for the term.

        Parameters
        ----------
        term : str
            The term to calculate the IDF of

        Returns
        -------
        float
            The IDF, or infinity if the term is absent from the corpus or
            has no documents recorded for it.

        Examples
        --------
        >>> tqbf = 'the quick brown fox jumped over the lazy dog\n\n'
        >>> tqbf += 'and then it slept\n\n and the dog ran off'
        >>> corp = UnigramCorpus(tqbf)
        >>> round(corp.idf('dog'), 10)
        0.6931471806
        >>> round(corp.idf('the'), 10)
        0.6931471806


        .. versionadded:: 0.4.0

        """
        # .get() never triggers the defaultdict factory, so querying an
        # unknown term does not insert it into the corpus.
        entry = self.corpus.get(term)
        if entry is None or not entry[1]:
            # A zero document count (e.g. a default entry created by a bare
            # lookup) is treated like an unseen term instead of dividing by
            # zero.
            return float('inf')
        return log1p(self.doc_count / entry[1])


if __name__ == '__main__':
    # Executed directly as a script: run the doctests embedded in this
    # module's docstrings, ignoring differences in whitespace.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)