java source code of ParallelSuffixArray

phrasal-master
- src
  - edu
    - stanford
      - nlp
        mt
        wordcls
        ClustererState.java
        PartialStateUpdate.java
        NgramHistory.java
        OneSidedObjectiveFunction.java
        MakeWordClasses.java
        preordering
        DependencyBnBPreorderer.java
        ClauseTypeLabeller.java
        RuleBasedGermanPreorderer.java
        decoder
        feat
        sparse
        TargetClassBigramBoundary.java
        RuleTargetDimension.java
        PrefixOverlap.java
        InDomainRule.java
        LengthRatio.java
        RulePunctuation.java
        PrefixAlignmentFeaturizer.java
        RuleUnalignedFeaturizer.java
        DistortionProbability.java
        RuleShape.java
        PunctuationDifference.java
        DiscriminativeSignedDistortion.java
        DiscriminativeAlignments.java
        RuleSourceDimension.java
        RuleProvenanceFeaturizer.java
        RuleIndicator.java
        TargetUnigramClass.java
        RuleAlignmentConstellation.java
        TargetFunctionWordInsertion.java
        RuleFertilityFeaturizer.java
        AlignmentDiagonalDistance.java
        DerivationFeaturizer.java
        FeaturizerState.java
        base
        HierarchicalReorderingFeaturizer.java
        PhrasePenaltyFeaturizer.java
        UnknownWordFeaturizer.java
        DTULinearDistortionFeaturizer.java
        WordPenaltyFeaturizer.java
        NGramLanguageModelFeaturizer.java
        SourceGapFeaturizer.java
        TranslationModelFeaturizer.java
        LinearFutureCostFeaturizer.java
        TargetGapFeaturizer.java
        LexicalReorderingFeaturizer.java
        Featurizer.java
        FeaturizerFactory.java
        NeedsCloneable.java
        package.html
        FeatureUtils.java
        FeatureExtractor.java
        RuleFeaturizer.java
        deplm
        GenerativeDependencyLanguageModelFeaturizer.java
        BBNDependencyLanguageModelFeaturizer.java
        PreorderingAgreement.java
        AbstractDependencyLanguageModelFeaturizer.java
        DTUDecoder.java
        CubePruningDecoder.java
        AbstractInferer.java
        h
        DTUIsolatedPhraseForeignCoverageHeuristic.java
        HeuristicFactory.java
        IsolatedPhraseForeignCoverageHeuristic.java
        NullHeuristic.java
        OptimisticForeignCoverageHeuristic.java
        package.html
        SearchHeuristic.java
        util
        TreeBeam.java
        SloppyBeam.java
        SparseScorer.java
        RuleGrid.java
        DTUHypothesis.java
        SoftPrefixOutputSpace.java
        OutputSpaceFactory.java
        MultiTranslationState.java
        ScorerFactory.java
        BundleBeam.java
        BeamFactory.java
        UnconstrainedOutputSpace.java
        State.java
        OutputSpace.java
        HyperedgeBundle.java
        StateLatticeDecoder.java
        UniformScorer.java
        SyntheticRules.java
        DenseScorer.java
        Beam.java
        SmartBundleBeam.java
        ConstrainedOutputSpace.java
        PrefixOutputSpace.java
        NbestListUtils.java
        Scorer.java
        DiverseNbestDecoder.java
        WrapBoundaryOutputSpace.java
        Derivation.java
        AbstractInfererBuilder.java
        AbstractBeamInfererBuilder.java
        Inferer.java
        recomb
        MetricBasedRecombinationFilter.java
        MSDRecombinationFilter.java
        CombinedRecombinationFilter.java
        SoftConstrainedDecodingRecombinationFilter.java
        RecombinationHash.java
        RecombinationFilter.java
        DTURecombinationFilter.java
        RecombinationHistory.java
        ExactRecombinationFilter.java
        SourceCoverageRecombinationFilter.java
        RecombinationFilterFactory.java
        LinearDistortionRecombinationFilter.java
        package.html
        NGramLMRecombinationFilter.java
        MultiBeamDecoder.java
        AbstractBeamInferer.java
        InfererBuilderFactory.java
        package.html
        InfererBuilder.java
        tune
        GreedyMultiTranslationMetricMax.java
        BatchOptimizer.java
        BeamMultiTranslationMetricMax.java
        MultiTranslationMetricMax.java
        OnlineUpdateRule.java
        AgendaMultiTranslationMetricMax.java
        MERT.java
        OnlineTuner.java
        BatchOptimizerFactory.java
        HillClimbingMultiTranslationMetricMax.java
        OnlineOptimizerFactory.java
        optimizers
        LineSearchOptimizer.java
        PairwiseRankingOptimizer.java
        OptimizerUtils.java
        SequenceOptimizer.java
        KoehnStyleOptimizer.java
        AdaGradUpdater.java
        CrossEntropyOptimizer.java
        ExpectedBLEUOptimizer.java
        MIRA1BestHopeFearOptimizer.java
        AbstractBatchOptimizer.java
        BasicPowellOptimizer.java
        AdaGradFOBOSUpdater.java
        AdaGradFastFOBOSUpdater.java
        PowellOptimizer.java
        DownhillSimplexMinimizer.java
        AbstractOnlineOptimizer.java
        PairwiseRankingOptimizerSGD.java
        DownhillSimplexOptimizer.java
        MiraUpdater.java
        CerStyleOptimizer.java
        SGDUpdater.java
        OnlineOptimizer.java
        package.html
        util
        Sequence.java
        NBestListContainer.java
        ListTopMultiTranslationMetricMax.java
        InputProperties.java
        CoreNLPCache.java
        MurmurHash2.java
        Featurizable.java
        DTUFeaturizable.java
        MurmurHash3.java
        CoverageSet.java
        IntegerArrayIndex.java
        PhraseAlignment.java
        ArraySequence.java
        AtomicBitSet.java
        ParallelCorpus.java
        SourceClassMap.java
        WordPredictionAccuracy.java
        ProbingIntegerArrayRawIndex.java
        IStrings.java
        FeatureValue.java
        Sequences.java
        FactoryUtil.java
        IString.java
        IBMModel1.java
        InputProperty.java
        KSR.java
        LineIndexedCorpus.java
        ScoredFeaturizedTranslation.java
        PositionIndependentDistance.java
        TrieIntegerArrayIndex.java
        AbstractSequence.java
        FeatureValueCollection.java
        SparseFeatureValueCollection.java
        IntegerArrayRawIndex.java
        BasicNBestList.java
        ParallelSuffixArrayEntry.java
        DenseFeatureValueCollection.java
        DynamicIntegerArrayIndex.java
        FeaturizedTranslation.java
        FeatureValues.java
        Vocabulary.java
        AbstractWordClassMap.java
        RichTranslation.java
        IOTools.java
        BasicNBestEntry.java
        TargetClassMap.java
        AlignedSentence.java
        ParallelSuffixArray.java
        TokenUtils.java
        TimingUtils.java
        FlatNBestList.java
        ProbingIntegerArrayIndex.java
        Phrasal.java
        lm
        LMState.java
        KenLMState.java
        KenLanguageModel.java
        ARPALMState.java
        LanguageModelFactory.java
        ARPALanguageModel.java
        LanguageModel.java
        KenLM.java
        NPLM.java
        tm
        TranslationModel.java
        TranslationModelFactory.java
        DTUTable.java
        DTURule.java
        LexicalReorderingTable.java
        CompiledPhraseTable.java
        Rule.java
        HierarchicalReorderingModel.java
        DynamicTranslationModel.java
        DynamicReorderingModel.java
        ExtendedLexicalReorderingTable.java
        SampledRule.java
        HierarchicalReorderingValidator.java
        IdentityPhraseGenerator.java
        PhraseTable.java
        AbstractDynamicReorderingModel.java
        PhraseTableEntry.java
        AbstractPhraseGenerator.java
        WordBasedReorderingModel.java
        CombinedTranslationModel.java
        UnknownWordPhraseGenerator.java
        ConcreteRule.java
        process
        en
        EnglishPreprocessor.java
        EnglishPostprocessor.java
        package.html
        IdentityPostprocessor.java
        CRFPreprocessor.java
        CRFPostprocessorFeatureFactory.java
        ProcessorFactory.java
        es
        SpanishPostprocessor.java
        SpanishPreprocessor.java
        ProcessorTools.java
        MosesCompoundSplitter.java
        fr
        FrenchPreprocessor.java
        FrenchPostprocessor.java
        zh
        ChinesePreprocessor.java
        de
        GermanPostprocessor.java
        GermanPreprocessor.java
        CoreNLPPreprocessor.java
        Preprocessor.java
        CRFPostprocessor.java
        package.html
        Postprocessor.java
        tools
        PrefixTagger.java
        PhrasalWeightsToMoses.java
        FilterNGramsByCoOccurrences.java
        SignificanceTest.java
        NISTTokenizer.java
        MTEvalXMLExtractor.java
        Lattice.java
        FindMetricMax.java
        OverrideBinwts.java
        BLEUSorter.java
        SelectCorpus.java
        TMXExtractor.java
        NBestArgmax.java
        LanguageModelPerplexity.java
        PrintWeights.java
        QueryTranslationModel.java
        FDACorpusSelection.java
        ConvertWeights.java
        NBestErrorSurface.java
        Evaluate.java
        CoverageChecker.java
        MinimumBayesRisk.java
        NbestEvaluationAnnotation.java
        NBestReranker.java
        BLEUGenreEvaluator.java
        CompareWeights.java
        AER.java
        LanguageModelTrueCaser.java
        DynamicRuleQuery.java
        FilterOOVByPhraseTable.java
        Preprocess.java
        SentenceLevelEvaluation.java
        package.html
        OnlineLearningCurve.java
        deplm
        SerializedDependencyToCoNLL.java
        BuildDependencyLM.java
        DependencyUtils.java
        BuildDependencyLMData.java
        DependencyProjector.java
        DependencyLanguageModelPerplexity2.java
        DependencyLanguageModelPerplexity.java
        DependencyLanguageModelScoreNBest.java
        DependencyProjectorCoNLL.java
        BuildDependencyLMData2.java
        NgramOverlap.java
        stats
        RandomDistribution.java
        Functions.java
        Distributions.java
        Concentration.java
        Sampling.java
        ConfidenceIntervals.java
        NumericalPrecision.java
        SimilarityMeasures.java
        train
        InDomainFeatureExtractor.java
        AlignmentSymmetrizer.java
        PhrasePrinter.java
        AlignmentGrid.java
        AbstractPhraseExtractor.java
        CountFeatureExtractor.java
        AlignmentTemplates.java
        PosTaggedSourceFilter.java
        AlGridCell.java
        AlignmentTemplateInstance.java
        SymmetricalWordAlignment.java
        SourceFilter.java
        DTUPhraseExtractor.java
        AbstractFeatureExtractor.java
        AbstractSourceFilter.java
        DiscontinuousSubSequences.java
        DTUInstance.java
        AbstractWordAlignment.java
        MosesPharoahFeatureExtractor.java
        PhrasalSourceFilter.java
        DynamicTMBuilder.java
        SoftPhraseExtractor.java
        ParaphraseExtractor.java
        PhraseExtract.java
        WordAlignment.java
        FlatPhraseExtractor.java
        LexicalReorderingFeatureExtractor.java
        PhraseExtractor.java
        DTUFeatureExtractor.java
        AlignmentTemplate.java
        PlainPhrasePrinter.java
        GIZAWordAlignment.java
        package.html
        FeatureExtractor.java
        DTUSourceFilter.java
        metrics
        LengthPenaltyBleu.java
        NISTMetric.java
        SLBLEUAfterPrefix.java
        SLLinearCombinationMetric.java
        IncrementalNBestEvaluationMetric.java
        WERMetric.java
        SentenceLevelMetricFactory.java
        AbstractTERMetric.java
        BLEUOracleCost.java
        SLTERMetric.java
        NumPredictedWordsMetric.java
        LinearCombinationMetric.java
        MetricUtils.java
        EvaluationMetric.java
        BLEUMetric.java
        NextPredictedWordMetric.java
        ScorerWrapperEvaluationMetric.java
        SLTERGain.java
        SLGeometricCombinationMetric.java
        RepetitionRate.java
        TERpMetric.java
        BLEUAfterPrefixMetric.java
        GeometricMeanCombinationMetric.java
        AbstractMetric.java
        IncrementalEvaluationMetric.java
        SentenceLevelMetric.java
        CorpusLevelMetricFactory.java
        LocalNumPredictedWordsMetric.java
        BLEUGain.java
        LocalNextPredictedWordMetric.java
        NgramPrecisionIncrementalMetric.java
        MaxPredictedWordsMetric.java
        package.html
        LengthMetric.java
        PERMetric.java
        benchmark
        HistogramTest.java
        TestHashSpeed.java
        DynamicTMThreading.java
        SequenceConcat.java
- ptm
  - wmt-rank
    - expected-wins.py
    - compute_agreement_scores.py
    - README
    - mfas_solver.py
    - wmtformat.py
    - lopezranking.py
  - tm
    - tm_nginx.conf
    - uwsgi_params
    - tmapp
      - forms_admin.py
      - templates
        translate.html
        survey.html
        training.html
        playback.html
        translate_demo.html
        index.html
        translate_playback.html
      - models.py
      - views.py
      - urls.py
      - forms.py
      - tests.py
      - controller.py
      - __init__.py
      - admin.py
      - model_utils.py
      - static
        tmapp
        css
        ui.css
        form.css
      - choices.py
    - templates
      - robots.txt
      - base.html
      - 404.html
      - 500.html
      - login.html
    - manage.py
    - uwsgi.ini
    - tm
      - wsgi.py
      - urls.py
      - __init__.py
      - settings.py
    - setup-server.sh
    - bin
      - compress
      - make-experiment.py
      - pilot-study.sh
      - make-experiment-pilot.py
    - logs
      - .gitignore
    - .gitignore
    - static
      - img
      - favicon.ico
      - js
        BrowserCheck.js
        modernizr.js
      - css
        base.css
        PTM.css
  - scripts
    - sql
      - exp1_docs.sql
      - dump_exit_survey.sql
      - dump_invalid.sql
      - extra_experiments.sql
      - dump_logs.sql
      - dumpdb.sql
      - default_db.sql
    - tsv_to_targettxt_csv.py
    - load_source_doc.sh
    - human_eval
      - permute_file.py
      - ans2csv.sh
      - csv2ranking.py
      - copy_tgt_to_maise.py
      - README
      - aws.sh
      - mk_batch.sh
      - mfas_solver.py
      - mk_taskdir.sh
      - mk_task.sh
    - load_target_doc.sh
    - csv_to_postgres.sh
    - cleanwebdoc.py
    - README
    - run_sql_script.sh
    - doc2tsv.py
    - mt
      - README
      - filter_counts_file.py
      - tokenize_counts_file.sh
    - setup_default_db.sh
    - add_extra_translations.sh
    - export_sqlite3_script.sh
    - csv_unicode.py
    - wrangle
      - make_translation_frame.py
      - filter_users.py
      - ptm_stats.py
      - imt_extract_translations.py
      - actionlog_hover_events_to_json.py
      - ptm_file_io.py
      - imt_time_stats_from_dump.py
      - proc_dbdump.py
      - edit_distance.py
      - js_keycodes.tsv
      - make_user_frame.py
      - giga_vocab.sh
      - token_counts.py
      - imt_translations_to_appraise.sh
      - imt_extract_ui_events.py
      - csv_unicode.py
      - actionlog_to_csv.py
      - make_pause_dist_from_actionlog.py
      - make_source_frame.py
      - event_diff_test.py
      - imt_utils.py
      - mt_clustering_test.py
      - time_diff_test.py
    - tsv_to_sql_csv.py
- resources
  - log4j2.xml
- ems
  - README
  - phrasal-ems.html
- src-cc
  - compile_JNI.sh
  - edu_stanford_nlp_mt_lm_KenLM.cc
  - README
  - kenlm
    - bjam
    - BUILDING
    - COPYING.LESSER.3
    - jam-files
      - empty_test_main.cc
      - sanity.jam
      - fail
        Jamroot
      - LICENSE_1_0.txt
      - engine
        mem.c
        timestamp.h
        jam.h
        pathsys.c
        timestamp.c
        execnt.c
        command.c
        jamgram.y
        lists.c
        output.h
        fileunix.c
        function.h
        class.h
        parse.c
        pathunix.c
        filesys.h
        lists.h
        builtins.h
        scan.c
        debug.c
        hash.c
        build.jam
        search.c
        debug.h
        hcache.c
        subst.h
        scan.h
        constants.h
        constants.c
        jambase.h
        compile.c
        mkjambase.c
        frames.c
        yyacc.c
        headers.h
        function.c
        debian
        jam.man.sgml
        changelog
        rules
        copyright
        control
        md5.c
        patchlevel.h
        class.c
        jambase.c
        rules.h
        bump_version.py
        strings.h
        build.bat
        output.c
        frames.h
        hash.h
        option.c
        jamgramtab.h
        rules.c
        glob.c
        variable.h
        regexp.h
        make.h
        pathnt.c
        native.c
        filesys.c
        boost-no-inspect
        jamgram.c
        regexp.c
        execcmd.c
        modules.h
        builtins.c
        cwd.c
        subst.c
        w32_getreg.c
        search.h
        make.c
        Jambase
        jamgram.h
        boost-jam.spec
        jamgram.yy
        make1.c
        strings.c
        md5.h
        filent.c
        option.h
        hdrmacro.c
        hcache.h
        jam.c
        native.h
        execcmd.h
        mem.h
        command.h
        hdrmacro.h
        compile.h
        object.c
        execunix.c
        modules
        path.c
        property-set.c
        readme.txt
        set.c
        regex.c
        sequence.c
        order.c
        object.h
        build.sh
        parse.h
        cwd.h
        pathsys.h
        modules.c
        headers.c
        variable.c
      - boost-build
        user-config.jam
        kernel
        modules.jam
        bootstrap.jam
        boost-build.jam
        errors.jam
        class.jam
        options
        help.jam
        build
        property-set.jam
        feature.jam
        virtual-target.jam
        alias.jam
        property.jam
        build-request.jam
        configure.jam
        readme.txt
        targets.jam
        scanner.jam
        config-cache.jam
        ac.jam
        toolset.jam
        version.jam
        generators.jam
        project.jam
        type.jam
        bootstrap.jam
        boost-build.jam
        site-config.jam
        util
        option.jam
        doc.jam
        string.jam
        print.jam
        numbers.jam
        regex.jam
        assert.jam
        path.jam
        utility.jam
        indirect.jam
        set.jam
        os.jam
        order.jam
        sequence.jam
        container.jam
        build-system.jam
        tools
        xsltproc-config.jam
        stlport.jam
        qt5.jam
        cast.jam
        quickbook-config.jam
        testing-aux.jam
        hpfortran.jam
        python.jam
        unix.jam
        xsltproc
        included.xsl
        test.xml
        test.xsl
        lex.jam
        notfile.jam
        mpi.jam
        message.jam
        doxygen-config.jam
        symlink.jam
        qcc.jam
        auto-index.jam
        tiff.jam
        darwin.jam
        fortran.jam
        como-win.jam
        stage.jam
        sun.jam
        fop.jam
        package.jam
        qt4.jam
        convert.jam
        cw-config.jam
        clang.jam
        qt3.jam
        xsltproc.jam
        png.jam
        zlib.jam
        boostbook-config.jam
        pathscale.jam
        doxygen
        windows-paths-check.hpp
        windows-paths-check.doxyfile
        builtin.jam
        whale.jam
        como.jam
        docutils.jam
        intel-win.jam
        hp_cxx.jam
        intel-darwin.jam
        gfortran.jam
        clang-darwin.jam
        jpeg.jam
        acc.jam
        python-config.jam
        vacpp.jam
        pch.jam
        mc.jam
        bison.jam
        msvc-config.jam
        xlf.jam
        dmc.jam
        pgi.jam
        midl.jam
        mipspro.jam
        boostbook.jam
        intel.jam
        types
        obj.jam
        lib.jam
        exe.jam
        register.jam
        asm.jam
        html.jam
        objc.jam
        cpp.jam
        preprocessed.jam
        rsp.jam
        qt.jam
        cray.jam
        msvc.jam
        como-linux.jam
        common.jam
        testing.jam
        generate.jam
        gettext.jam
        clang-linux.jam
        ifort.jam
        gcc.jam
        cw.jam
        quickbook.jam
        borland.jam
        rc.jam
        intel-linux.jam
        doxygen.jam
        make.jam
        qt.jam
    - Doxyfile
    - compile_query_only.sh
    - clean_query_only.sh
    - LICENSE
    - util
      - read_compressed.cc
      - bit_packing.hh
      - getopt.hh
      - pool.cc
      - file.cc
      - read_compressed_test.cc
      - bit_packing.cc
      - proxy_iterator.hh
      - Jamfile
      - thread_pool.hh
      - string_piece.cc
      - float_to_string.cc
      - have.hh
      - integer_to_string.cc
      - tokenize_piece_test.cc
      - file_piece.cc
      - stream
        stream_test.cc
        timer.hh
        Jamfile
        multi_progress.hh
        config.hh
        block.hh
        io.hh
        io.cc
        chain.cc
        multi_stream.hh
        sort_test.cc
        line_input.hh
        count_records.hh
        count_records.cc
        rewindable_stream.hh
        chain.hh
        sort.hh
        CMakeLists.txt
        stream.hh
        io_test.cc
        rewindable_stream.cc
        rewindable_stream_test.cc
        line_input.cc
        multi_progress.cc
      - murmur_hash.hh
      - string_piece.hh
      - probing_hash_table_test.cc
      - exception.hh
      - probing_hash_table_benchmark_main.cc
      - file_piece.hh
      - cat_compressed_main.cc
      - ersatz_progress.cc
      - fixed_array.hh
      - joint_sort_test.cc
      - pcqueue.hh
      - read_compressed.hh
      - sized_iterator_test.cc
      - bit_packing_test.cc
      - parallel_read.cc
      - float_to_string.hh
      - usage.hh
      - probing_hash_table.hh
      - file_piece_test.cc
      - sorted_uniform.hh
      - integer_to_string_test.cc
      - string_piece_hash.hh
      - usage.cc
      - murmur_hash.cc
      - ersatz_progress.hh
      - CMakeLists.txt
      - multi_intersection.hh
      - sized_iterator.hh
      - pool.hh
      - sorted_uniform_test.cc
      - exception.cc
      - integer_to_string.hh
      - fake_ofstream.hh
      - file.hh
      - double-conversion
        Jamfile
        fixed-dtoa.cc
        fast-dtoa.h
        bignum.cc
        diy-fp.h
        LICENSE
        ieee.h
        bignum-dtoa.h
        utils.h
        double-conversion.h
        strtod.cc
        cached-powers.h
        strtod.h
        CMakeLists.txt
        bignum-dtoa.cc
        double-conversion.cc
        diy-fp.cc
        fixed-dtoa.h
        cached-powers.cc
        fast-dtoa.cc
        bignum.h
      - multi_intersection_test.cc
      - mmap.cc
      - parallel_read.hh
      - pcqueue_test.cc
      - mmap.hh
      - getopt.c
      - joint_sort.hh
      - scoped.cc
      - tokenize_piece.hh
      - scoped.hh
    - lm
      - return.hh
      - state.hh
      - partial.hh
      - Jamfile
      - value_build.cc
      - config.hh
      - sizes.hh
      - bhiksha.hh
      - trie_sort.cc
      - model.hh
      - search_hashed.cc
      - lm_exception.cc
      - left.hh
      - read_arpa.cc
      - kenlm_benchmark_main.cc
      - vocab.cc
      - max_order.hh
      - builder
        payload.hh
        TODO
        adjust_counts.cc
        Jamfile
        debug_print.hh
        output.hh
        discount.hh
        pipeline.cc
        interpolate.cc
        adjust_counts.hh
        corpus_count.hh
        interpolate.hh
        dump_counts_main.cc
        lmplz_main.cc
        pipeline.hh
        output.cc
        adjust_counts_test.cc
        CMakeLists.txt
        corpus_count_test.cc
        README.md
        corpus_count.cc
        initial_probabilities.hh
        header_info.hh
        combine_counts.hh
        hash_gamma.hh
        initial_probabilities.cc
      - word_index.hh
      - query_main.cc
      - blank.hh
      - partial_test.cc
      - common
        Jamfile
        model_buffer.hh
        print.hh
        model_buffer.cc
        ngram_stream.hh
        print.cc
        special.hh
        size_option.hh
        ngram.hh
        renumber.cc
        compare.hh
        renumber.hh
        size_option.cc
        CMakeLists.txt
        joint_order.hh
      - virtual_interface.cc
      - fragment_main.cc
      - quantize.hh
      - test.arpa
      - test_nounk.arpa
      - search_trie.cc
      - value_build.hh
      - model_type.hh
      - search_hashed.hh
      - neural
        Jamfile
        wordvecs.hh
      - trie.cc
      - model.cc
      - vocab.hh
      - quantize.cc
      - trie.hh
      - bhiksha.cc
      - CMakeLists.txt
      - search_trie.hh
      - lm_exception.hh
      - facade.hh
      - sizes.cc
      - ngram_query.hh
      - binary_format.cc
      - enumerate_vocab.hh
      - interpolate
        Jamfile
        bounded_sequence_encoding_test.cc
        split_worker.cc
        bounded_sequence_encoding.cc
        pipeline.cc
        normalize.hh
        merge_test
        test_no_unk
        test3
        test1
        test2
        test_bad_order
        backoff_reunification.cc
        tune_derivatives_test.cc
        train_params_grant_main.cc
        split_worker.hh
        tune_instance.cc
        tune_main.cc
        interpolate_main.cc
        normalize_test.cc
        merge_probabilities.cc
        train_params_main.cc
        interpolate_info.hh
        merge_vocab.cc
        toy_data
        toy1.lm
        toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm
        toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm
        toy2.lm
        tune_derivatives.cc
        pipeline.hh
        tune_instance_test.cc
        bounded_sequence_encoding.hh
        universal_vocab.hh
        enumerate_global_vocab.hh
        universal_vocab.cc
        tune_derivatives.hh
        backoff_reunification_test.cc
        backoff_reunification.hh
        merge_probabilities.hh
        tune_instance.hh
        merge_vocab_test.cc
        perf_enum_gv_main.cc
        streaming_example_main.cc
        normalize.cc
        tune_instance_data
        toy1.1
        toy0.kenlm_intermediate
        toy0.vocab
        toy0.2
        toy1.vocab
        toy0.3
        toy0.1
        toy1.2
        toy1.3
        toy1.kenlm_intermediate
        generate.sh
        backoff_matrix.hh
        merge_vocab.hh
        enumerate_global_vocab.cc
      - model_test.cc
      - read_arpa.hh
      - trie_sort.hh
      - left_test.cc
      - filter
        thread.hh
        Jamfile
        wrapper.hh
        count_io.hh
        vocab.cc
        arpa_io.cc
        filter_main.cc
        phrase.cc
        vocab.hh
        CMakeLists.txt
        format.hh
        arpa_io.hh
        phrase_table_vocab_main.cc
        phrase.hh
      - wrappers
        nplm.cc
        README
        nplm.hh
      - weights.hh
      - config.cc
      - binary_format.hh
      - build_binary_main.cc
      - virtual_interface.hh
      - value.hh
    - COPYING
    - setup.py
    - windows
      - kenlm.sln
      - build_binary.vcxproj
      - ngram_query.vcxproj
      - lmplz.vcxproj
      - kenlm.vcxproj
    - python
      - example.py
      - kenlm.pyx
      - _kenlm.pxd
    - Jamroot
    - CMakeLists.txt
    - README.md
    - COPYING.3
    - .gitignore
  - edu_stanford_nlp_mt_lm_NPLM.cc
- test
  - edu
    - stanford
      - nlp
        mt
        decoder
        util
        RuleGridTest.java
        util
        AtomicBitSetTest.java
        MurmurHashTest.java
        VocabularyTest.java
        ProbingIntegerArrayIndexTest.java
        AlignedSentenceTest.java
        CoverageSetTest.java
        IStringTest.java
        SequenceTest.java
        ProbingIntegerArrayRawIndexTest.java
        FlatNBestListTest.java
        PositionIndependentDistanceTest.java
        PhraseAlignmentTest.java
        FeatureValueTest.java
        TestMurmurHash3.java
        ParallelCorpusTest.java
        lm
        ARPALanguageModelTest.java
        KenLanguageModelTest.java
        NPLMTest.java
        stats
        FunctionsTest.java
        DistributionsTest.java
- scripts-private
  - ibm2class
  - de-morph.py
  - ucb-align-posterior.conf
  - combine_crf_hmm
  - ibm2noclass
  - moses-train.sh
  - nist-domains.py
  - split-domains.py
  - tolower-utf8.py
  - de-detokenize.sh
  - extract-hypen-caps.pl
  - srilm
    - lm-counts.sh
    - lm.local
    - lm-compile.sh
    - lm-class-based.sh
  - rerank.pl
  - extract_refs.pl
  - tune-imt.sh
  - en_lm_truecaser
  - ibm_sgml2xml
  - ibm_tok
  - bolt_zh_postprocess.py
  - mk-systemdir.sh
  - phrase-viewer
  - giga_sgml2plain.py
  - wmt-common-post.pl
  - phrasal-test.sh
  - en_crf_truecaser
  - de_recase_by_POS_tag.pl
  - phrasal_output2vizualize_xml
  - apply_hypen_dict.pl
  - bolt
    - score.pl
    - catWithRepetitions.pl
    - process_test_sets.pl
    - make_composite_ref.pl
    - merge_mbr_very_verbose.py
    - randomlySelectShortSentences.pl
    - dataTool.pl
    - create_split_set.pl
    - promote_new_1best.pl
    - sum_fields.pl
    - make_genre_stats.pl
    - createIntegrationTest.pl
    - make_bitext_file_info.pl
    - make_concatenated_bitext.pl
    - renumber_nbest_from_zero.pl
    - get_length_ratio.pl
    - make_package_for_ibm.pl
    - truncate_nbest.pl
    - makeShortNBest.pl
    - extractTextFromPhraseTable.pl
    - convert_ibm_rbt.pl
    - split_nbest.pl
    - convert_ibm_sgm_to_plain_text.pl
    - subselectUsingIndices.pl
    - replaceString.pl
  - input_ibm2phrasal
  - neural
    - tune_reranker.sh
    - test_reranker.sh
    - toy.nbest
    - extract_nbest.py
    - toy.score2
    - README
    - toy.score1
    - wait_until_file_exists.pl
    - add_neural_scores.sh
    - clean_nbest.py
    - toy.allScores
    - nbest_add_neural_score.py
    - extract_1best.pl
    - add_length_to_nbest.pl
    - run_rerank.sh
    - tune_and_test_reranker.pl
  - make-phrasal-release.sh
  - normalize_to_orig.pl
  - p2m-phrase-table.py
  - de-post.py
  - tokenize.sh
  - de-rules.py
  - post_process_ibm.py
  - sentence-split
  - extract-liblinear-weights.py
  - shuffle-bitext.py
  - lm_ibm2phrasal
  - mkconf.py
  - moses-test.sh
  - normalize_to_rbt.pl
  - en_truecaser
  - fr-detokenize.sh
  - deduplicate.py
  - ptable_ibm2phrasal
  - tokenize-BOLTonly.sh
  - merge_casings.py
  - moses-tune.sh
  - ucb-align.conf
  - cleanup_txt.py
  - unfiltered-ptable.sh
  - ibmnbest.pl
  - align.sh
- lib
- src-extra
  - edu
    - stanford
      - nlp
        mt
        tune
        optimizers
        RandomAltPairs.java
        BetterWorse3KMeans.java
        SoftmaxMaxMarginSlackRescaling.java
        LogLinearMultiTarget.java
        BetterWorseCentroids.java
        FullKMeans.java
        RandomPairs.java
        PerceptronOptimizer.java
        PointwisePerceptron.java
        SoftmaxMaxMarginMarkovNetwork.java
        RandomNBestPoint.java
        BetterWorse2KMeans.java
        visualize
        phrase
        PhraseGUI.java
        PhraseController.java
        TranslationLayout.java
        Phrase.java
        Translation.java
        PathDialog.java
        PhraseViewer.java
        phrase-viewer-paths.xsd
        AnalysisDialog.java
        ScoreDistribution.java
        PathModel.java
        VisualPhrase.java
        ClickEventListener.java
        PhraseModel.java
        ClickEvent.java
        OptionsDialog.java
        NamedLabel.java
        service
        PhrasalServlet.java
        handlers
        TranslationQuery.java
        TranslationRequestHandlerMock.java
        ServiceResponse.java
        RequestHandler.java
        RuleQuery.java
        UnknownRequestHandler.java
        RuleQueryRequestHandler.java
        ScoredQuery.java
        TranslationRequestHandler.java
        RuleQueryRequestHandlerMock.java
        PhrasalService.java
        Messages.java
        tools
        SourceDocument.java
        SourceTextAnalyzer.java
        RawFrenchToJSON.java
        SourceSegment.java
        CoreNLPToJSON.java
        DecodePrefixFile.java
        tools
        ComputeBitextIDF.java
        ModifiedMooreLewisCorpusSelection.java
        TranslationModelComparator.java
        BLEUConditionSignificanceTest.java
        MakePTMPhrasalInput.java
        MooreLewisCorpusSelection.java
        SplitByInterfaceCondition.java
        SentencelevelBLEUVariance.java
- build.gradle
- example
  - README
  - example.vars
  - example.binwts
  - example.ini
  - ucb-align.conf
- README.md
- scripts
  - separate_lines
  - printFeatureNames.sh
  - phrasal-mert.pl
  - nist_tok
  - mt_utils.pm
  - make_html_analysis.pl
  - phrasal.sh
  - nbest2uniq
  - word2class.sh
  - mteval
  - arg_utils.pm
  - link-best-ini
  - train-postprocessor.sh
  - nbest2plain
  - ter
  - plain2sgml
  - align2grid.pl
  - sgml_addsysid
  - nbest21best
  - bleu
  - phrasal-experiments.sh
  - en_detokenizer
  - reverse-align
  - align
  - sgml_filter
  - update_ini
  - clean-corpus.py
  - nist_tok_sgml
  - remove_unk_before_decode
  - oov-rate-for-file.py
  - output2html
  - kenlm.sh
  - select_data_with_lm+len_model
  - web-service.sh
  - sort_by_bleu
  - recase.sh
  - mtij.py
  - fda_select.sh
  - terp
  - mbr2plain.py
  - sgml2plain
  - construct_pinyin_table.pl
  - data_utils.pm
  - joshua2dtu
  - make-prefixes.py
  - align2grid_withPos.pl
- test-resources
  - inputs
    - sample.ini
    - mt06.msd-bidirectional-fe.gz
    - mt06.phrase-table.gz
    - properSample.test
    - dev12tune.lo-hier.msd2-bidirectional-fe.gz
    - improperFeatures.test
    - dev12tune.phrase-table.gz
    - improperIDs.test
    - empty.wts
- .gitignore
- LICENSE.txt
- doc
  - featureapi
    - DiscriminativePhraseTable.java
    - phrasal.online.ini
    - DerivationFeaturizer.java
    - WordPenaltyFeaturizer.java
    - NGramLanguageModelFeaturizer.java
    - phrasal-featureapi.lyx
    - RuleFeaturizer.java
  - michel_config
    - phrasal
      - sample
        Makefile
        ce-tiny.ini
        Makefile.local
        mt06.sgm
        mt05.sgm
      - Makefile
      - README
      - GALE
        P5-ae-text-hier2-d5.wts
        post-nw.sh
        P5-ae-audio-hier2-d5.ini
        P4-ae-audio-hier2-d5.ini
        P5-ae-text-hier2-d5.ini
        Makefile.local.P5-audio
        post-bn.sh
        Makefile.local.P4-text
        post-wb.sh
        P5-ae-audio-hier2-d5.wts
        P4-ae-audio-hier2-d5.wts
        P4-ae-text-hier2-d5.wts
        Makefile.local.P5-text
        post-bc.sh
        P4-ae-text-hier2-d5.ini
      - NIST
        ae-word-d4.ini
        ce-p2h2-d6.wts
        ae-none-d4.ini
        ce-p2h2-d6.ini
        Makefile.local.ae
        ce-hier2-d6.ini
        Makefile.local.ce
        Makefile.local.ce.subset
      - NAACL2010
        ce-src18.wts
        ce-src18.ini
        Makefile.local
      - scripts
        check-local-makefile
      - Makefile.research
    - lm
      - README
      - scripts
        LDC2009T13-giga4-xin.sh
        ce_mt_g3_afp_xin_1222
        init.sh
        ce_mt_g3_afp_xin_1233
        ae_mtFB_g4_afp_xin_1233
        ae_mtF_g3_afp_xin_1233
        ae_mtFB_g4_afp_xin_1222
        ibm-tokenizer
        eng_token_patterns
        eng_tokenizer.pl
        eng_token_list
        eng_simpl_class.pl
        eng_simpl_norm.pl
        ae_mt_g3_afp_xin_1233_cased
        LDC2009T13-giga4-cna.sh
        LDC2009T13-giga4-apw.sh
        clean_ibm
        LDC2009T13-giga4-afp.sh
        ae_bbn_1233
        ae_mt_g3_afp_xin_1233
      - train
        Makefile
      - input
        Makefile.ae.local-gale
        Makefile.ae.local-nist
        Makefile.ae
        Makefile.ce.local
        Makefile.ce
    - align
      - split5.conf
      - split3.conf
      - Makefile
      - split1.conf
      - split2.conf
      - Makefile.local.P4-arabic
      - split4.conf
      - split7.conf
      - lists.P4-arabic
        list.clean
        list.other7
        list.other5
        list.other4
        list.other6
        list.other2
        list.other1
        list.other3
      - split6.conf
    - documentation
      - FAQ.txt
      - README.1st
      - what_works_best.txt
      - STEPS.txt
    - align_giza
      - toy.fr
      - Makefile
      - README
      - toy.en
      - Makefile.local.template
  - CorpusSelectionNotes.txt
  - original-experiment.txt

package edu.stanford.nlp.mt.util;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.KryoSerializable;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import edu.stanford.nlp.mt.util.TimingUtils.TimeKeeper;

/**
 * An implementation of a parallel suffix array.
 * 
 * NOTE: The fields are protected, non-final for fast serialization/deserialization.
 * 
 * @author Spence Green
 *
 */
public class ParallelSuffixArray implements Serializable,KryoSerializable {

  private static final long serialVersionUID = -5403502473957235135L;

  private static final Logger logger = LogManager.getLogger(ParallelSuffixArray.class);
  
  protected int[] srcBitext;
  protected int[] f2e;
  protected int[] tgtBitext;
  protected int[] e2f;
  protected int[] srcSuffixArray; 
  protected int[] tgtSuffixArray;
  
  protected int numSentences;
  protected Vocabulary vocabulary;
  
  // Cache unigram positions in the target for the count() function.
  // The sample function already supports initialization with bounds, which
  // the calling method should maintain.
  protected transient int[] tgtCountLBCache;
  protected transient int[] tgtCountUBCache;
  
  /**
   * No-arg constructor for deserialization. Nothing is initialized here;
   * {@link #read(Kryo, Input)} fills in the bitext arrays, the suffix arrays,
   * the sentence count, and the vocabulary.
   */
  public ParallelSuffixArray() {}

  /**
   * Constructor. Careful. This constructor doubles peak memory, since the
   * contents of the corpus are copied into the flat arrays of this object.
   * Only the bitext is loaded here; call {@link #build()} afterwards to
   * create the suffix arrays.
   * 
   * @param corpus the in-memory parallel corpus to flatten
   */
  public ParallelSuffixArray(ParallelCorpus corpus) {
    loadCorpus(corpus);
  }
  
  /**
   * Constructor. Memory-efficient for large files. Only the bitext is loaded
   * here; call {@link #build()} afterwards to create the suffix arrays.
   * 
   * @param sourceFile path of the source-side text file
   * @param targetFile path of the target-side text file
   * @param alignFile path of the word alignment file
   * @throws IOException if one of the files cannot be read
   */
  public ParallelSuffixArray(String sourceFile, String targetFile, String alignFile) throws IOException {
    loadCorpus(sourceFile, targetFile, alignFile);
  }
  

  /**
   * Kryo serialization hook. Emits the six int arrays, then the sentence
   * count, then the vocabulary. The transient count() caches are not written.
   * 
   * @param kryo serializer used for the vocabulary object
   * @param output destination stream
   */
  @Override
  public void write(Kryo kryo, Output output) {
    // The sequence below must stay in sync with the order used by read().
    final int[][] arraysInWireOrder = {
        srcBitext, tgtBitext, e2f, f2e, srcSuffixArray, tgtSuffixArray
    };
    for (int[] array : arraysInWireOrder) {
      writeArray(array, output);
    }
    output.writeInt(numSentences, true);
    kryo.writeObject(output, vocabulary);
  }

  /**
   * Write an int array as a length prefix followed by its elements.
   * 
   * @param arr the array to write
   * @param output destination stream
   */
  private static void writeArray(int[] arr, Output output) {
    final int length = arr.length;
    output.writeInt(length, true);
    output.writeInts(arr, true);
  }

  /**
   * Kryo deserialization hook. Restores the six int arrays, the sentence
   * count, and the vocabulary. The transient count() caches are not restored
   * here.
   * 
   * @param kryo serializer used for the vocabulary object
   * @param input stream to read from
   */
  @Override
  public void read(Kryo kryo, Input input) {
    // Fields are read in exactly the order in which write() emits them.
    srcBitext = readArray(input);
    tgtBitext = readArray(input);
    e2f = readArray(input);
    f2e = readArray(input);
    srcSuffixArray = readArray(input);
    tgtSuffixArray = readArray(input);
    numSentences = input.readInt(true);
    vocabulary = kryo.readObject(input, Vocabulary.class);
  }
  
  /**
   * Read an int array stored as a length prefix followed by its elements.
   * 
   * @param input stream to read from
   * @return the decoded array
   */
  private static int[] readArray(Input input) {
    // The length prefix is consumed first, then that many elements.
    return input.readInts(input.readInt(true), true);
  }

  /**
   * Get the vocabulary associated with this suffix array. The bitext arrays
   * store ids from this vocabulary.
   * 
   * @return the vocabulary
   */
  public Vocabulary getVocabulary() { return vocabulary; }
  
  /**
   * Return a sequential stream of the sentence pairs in this bitext.
   * Negative entries of the source bitext are sentence boundary markers; one
   * sentence pair is created per marker, anchored at the corpus position
   * immediately before it.
   * 
   * @return one SentencePair per sentence boundary, in corpus order
   */
  public Stream<SentencePair> stream() {
    return IntStream.range(0, srcBitext.length)
        .filter(position -> srcBitext[position] < 0)
        .mapToObj(boundary -> new SentencePair(boundary - 1));
  }
  
  /**
   * Return a parallel stream of the sentence pairs in this bitext.
   * Negative entries of the source bitext are sentence boundary markers; one
   * sentence pair is created per marker, anchored at the corpus position
   * immediately before it.
   * 
   * @return one SentencePair per sentence boundary
   */
  public Stream<SentencePair> parallelStream() {
    return IntStream.range(0, srcBitext.length)
        .parallel()
        .filter(position -> srcBitext[position] < 0)
        .mapToObj(boundary -> new SentencePair(boundary - 1));
  }
  
  /**
   * Streaming loader, which does not double peak memory like the loader
   * that creates a suffix array from a parallel corpus.
   * <p>
   * The three files are read twice, line by line in lockstep. The first pass
   * counts sentences and corpus positions so that the flat arrays can be
   * allocated at their final size; the second pass copies each sentence into
   * them and writes the sentence boundary markers. Lines for which the corpus
   * returns no aligned sentence are skipped in both passes.
   * <p>
   * All three readers of each pass are managed by try-with-resources so that
   * the target and alignment readers are closed too, also when an exception
   * is thrown.
   * 
   * @param source path of the source-side text file
   * @param target path of the target-side text file
   * @param align path of the word alignment file
   * @throws IOException if one of the files cannot be read
   */
  private void loadCorpus(String source, String target, String align) throws IOException {
    logger.info("Counting the number of corpus positions");
    TimeKeeper timer = TimingUtils.start();
    // Read in the files once to count the sentences and corpus positions
    int numSourcePositions = 0;
    int numTargetPositions = 0;
    numSentences = 0;
    ParallelCorpus corpus = new ParallelCorpus(1);
    try (LineNumberReader fReader = IOTools.getReaderFromFile(source);
        LineNumberReader eReader = IOTools.getReaderFromFile(target);
        LineNumberReader aReader = IOTools.getReaderFromFile(align)) {
      for (String fLine; (fLine = fReader.readLine()) != null; ) {
        String eLine = eReader.readLine();
        String aLine = aReader.readLine();
        AlignedSentence example = corpus.getSentence(fLine, eLine, aLine);
        if (example != null) {
          numSourcePositions += example.sourceLength();
          numTargetPositions += example.targetLength();
          ++numSentences;
        }
      }
    }
    final int initialVocabularySize = corpus.getVocabulary().size();
    timer.mark("Counting corpus positions");
    logger.info("Source positions: {}  Target positions: {}  Sentences: {}", numSourcePositions, 
        numTargetPositions, numSentences);
    
    // Create the arrays. Each sentence needs one extra slot for its boundary marker.
    final int srcLength = numSourcePositions + numSentences;
    if (srcLength < 0) throw new RuntimeException("Maximum source bitext size exceeded");
    srcBitext = new int[srcLength];
    f2e = new int[srcLength];
    final int tgtLength = numTargetPositions + numSentences;
    if (tgtLength < 0) throw new RuntimeException("Maximum target bitext size exceeded");
    tgtBitext = new int[tgtLength];
    e2f = new int[tgtLength];
    
    // Read the files again and fill the arrays
    try (LineNumberReader fReader = IOTools.getReaderFromFile(source);
        LineNumberReader eReader = IOTools.getReaderFromFile(target);
        LineNumberReader aReader = IOTools.getReaderFromFile(align)) {
      int srcOffset = 0;
      int tgtOffset = 0;
      for (String fLine; (fLine = fReader.readLine()) != null; ) {
        String eLine = eReader.readLine();
        String aLine = aReader.readLine();
        AlignedSentence sentence = corpus.getSentence(fLine, eLine, aLine);
        if (sentence == null) {
          logger.info("Discarding parallel example {}", fReader.getLineNumber());
        } else {
          System.arraycopy(sentence.source, 0, srcBitext, srcOffset, sentence.sourceLength());
          System.arraycopy(sentence.f2e, 0, f2e, srcOffset, sentence.f2e.length);
          System.arraycopy(sentence.target, 0, tgtBitext, tgtOffset, sentence.targetLength());
          System.arraycopy(sentence.e2f, 0, e2f, tgtOffset, sentence.e2f.length);
          srcOffset += sentence.sourceLength();
          tgtOffset += sentence.targetLength();
          // Source points to target
          srcBitext[srcOffset] = toSentenceOffset(tgtOffset);
          // Target points to source
          tgtBitext[tgtOffset] = toSentenceOffset(srcOffset);
          ++srcOffset;
          ++tgtOffset;
        }        
      }
    }
    this.vocabulary = corpus.getVocabulary();
    // The second pass must not have introduced any new word types.
    assert initialVocabularySize == vocabulary.size();
    timer.mark("Loading corpus");
    logger.info("Done loading corpus: {}", timer);
  }
  
  /**
   * Flatten an in-memory parallel corpus into the contiguous bitext and
   * alignment arrays of this object. Every sentence is followed by one
   * boundary slot; the source boundary stores the encoded target boundary
   * position and vice versa.
   * Set the corpus reference to null after this call to free memory.
   * 
   * @param corpus the corpus to copy from
   */
  private void loadCorpus(ParallelCorpus corpus) {
    logger.info("Flattening parallel corpus");
    final TimeKeeper timer = TimingUtils.start();

    numSentences = corpus.size();
    final int sourceTokens = corpus.numSourcePositions();
    final int targetTokens = corpus.numTargetPositions();

    // One extra slot per sentence for the boundary marker.
    final int srcLength = sourceTokens + numSentences;
    srcBitext = new int[srcLength];
    f2e = new int[srcLength];
    final int tgtLength = targetTokens + numSentences;
    tgtBitext = new int[tgtLength];
    e2f = new int[tgtLength];

    int srcCursor = 0;
    int tgtCursor = 0;
    for (AlignedSentence sentence : corpus) {
      // Tokens and alignments of this sentence
      System.arraycopy(sentence.source, 0, srcBitext, srcCursor, sentence.sourceLength());
      System.arraycopy(sentence.f2e, 0, f2e, srcCursor, sentence.f2e.length);
      System.arraycopy(sentence.target, 0, tgtBitext, tgtCursor, sentence.targetLength());
      System.arraycopy(sentence.e2f, 0, e2f, tgtCursor, sentence.e2f.length);

      // Advance both cursors to the boundary slots
      srcCursor += sentence.sourceLength();
      tgtCursor += sentence.targetLength();

      // Cross-link the boundaries: source -> target and target -> source
      srcBitext[srcCursor] = toSentenceOffset(tgtCursor);
      tgtBitext[tgtCursor] = toSentenceOffset(srcCursor);

      // Step over the boundary slots
      srcCursor++;
      tgtCursor++;
    }

    vocabulary = corpus.getVocabulary();
    timer.mark("Corpus loading");
    logger.info("Done loading corpus: {}", timer);
  }

  /**
   * Encoding of bitext pointers. A corpus position p is stored as -(p + 1),
   * so that every encoded pointer is negative and position 0 stays
   * distinguishable from a token id.
   * 
   * @param corpusPosition the position to encode
   * @return the encoded (negative) pointer
   */
  private static int toSentenceOffset(int corpusPosition) {
    final int shifted = corpusPosition + 1;
    return -shifted;
  }
  
  /**
   * Decoding of bitext pointers. Inverse of the encoding p -> -(p + 1).
   * 
   * @param offset an encoded pointer
   * @return the corpus position it refers to
   */
  private static int fromSentenceOffset(int offset) {
    final int negated = -offset;
    return negated - 1;
  }
  
  /**
   * Create suffix arrays for the parallel corpus. The source array is built
   * first, then the target array. Each must contain exactly one entry per
   * non-boundary corpus position, otherwise a RuntimeException is thrown.
   */
  public void build() {
    logger.info("Building suffix arrays...");
    final TimeKeeper timer = TimingUtils.start();

    // Source side: every position except the per-sentence boundary slots
    final int expectedSourceSize = srcBitext.length - numSentences;
    srcSuffixArray = build(srcBitext, expectedSourceSize);
    if (srcSuffixArray.length != expectedSourceSize) {
      throw new RuntimeException();
    }
    timer.mark("Source array");

    // Target side
    final int expectedTargetSize = tgtBitext.length - numSentences;
    tgtSuffixArray = build(tgtBitext, expectedTargetSize);
    if (tgtSuffixArray.length != expectedTargetSize) {
      throw new RuntimeException();
    }
    timer.mark("Target array");

    logger.info("Done constructing suffix arrays: {}", timer);
  }
  
  /**
   * Sort the bitext in parallel.
   * <p>
   * Every index of the bitext is treated as the start of a suffix that runs
   * up to the next sentence boundary (a negative entry). Suffixes are ordered
   * token by token using the string order of the vocabulary entries; when one
   * suffix is a prefix of the other, the shorter one sorts first. Boundary
   * positions themselves compare greater than every real suffix, so they end
   * up at the tail of the sorted stream and are cut off by limit().
   * 
   * @param bitext the flattened source or target bitext
   * @param numPositions number of leading entries of the sorted stream to
   *        keep; the caller passes the number of non-boundary positions
   * @return the corpus positions of the kept suffixes in sorted order
   */
  private int[] build(final int[] bitext, int numPositions) {
    return IntStream.range(0, bitext.length).parallel().boxed().sorted((x,y) -> {
      // Compare suffixes
      int xPos = x, yPos = y, xId = bitext[x], yId = bitext[y];
      
      // Check to see if these points are sentence boundaries
      if (xId < 0 && yId < 0) {
        // Two boundaries are interchangeable; both are dropped later anyway.
        return 0;
      } else if (xId < 0) {
        // Say that sentence boundaries are longer than everything else.
        // They will be pushed to the end of the stream so that limit() can filter them.
        return 1;
      } else if (yId < 0) {
        return -1;
      }
            
      // Walk both suffixes in lockstep until they differ or one of them
      // reaches its sentence boundary.
      while(xId >= 0 && yId >= 0) {
        if (xId == yId) {
          xId = bitext[++xPos];
          yId = bitext[++yPos];
        } else {
          // Lexicographic sort
          return vocabulary.get(xId).compareTo(vocabulary.get(yId));
        }
      }
      
      // Compare lengths
      // At least one suffix hit its boundary: the one with tokens left is longer.
      int xLength = xPos - x + (xId < 0 ? 0 : 1);
      int yLength = yPos - y + (yId < 0 ? 0 : 1);
      return xLength - yLength;
      
    }).limit(numPositions).mapToInt(i -> i).toArray();
  }

  /**
   * Print the suffix array, one suffix per line, in the form
   * "rank: token token ...". Each suffix is printed up to (not including)
   * its sentence boundary. The writer is flushed at the end.
   * 
   * @param isSource print the source-side array if true, else the target side
   * @param out destination writer
   */
  public void print(boolean isSource, PrintWriter out) {
    final int[] sa = isSource ? this.srcSuffixArray : this.tgtSuffixArray;
    final int[] bitext = isSource ? this.srcBitext : this.tgtBitext;
    int rank = 0;
    for (final int suffixStart : sa) {
      final StringBuilder line = new StringBuilder();
      line.append(rank).append(": ");
      int corpusPos = suffixStart;
      while (bitext[corpusPos] >= 0) {
        if (corpusPos > suffixStart) {
          line.append(" ");
        }
        line.append(vocabulary.get(bitext[corpusPos]));
        ++corpusPos;
      }
      out.println(line.toString());
      ++rank;
    }
    out.flush();
  }
  
  /**
   * Find all source spans up to dimension == 3.
   * <p>
   * The source suffix array is scanned once. Consecutive suffixes that share
   * the same 1-, 2- or 3-token prefix form a run; each run with more than
   * {@code minOccurrences} members is stored in the returned cache together
   * with a sample of its sentences. Runs that are still open when the scan
   * reaches the end of the array are checked as well.
   * <p>
   * As a side effect this method (re)builds the target unigram caches used by
   * {@link #count(int[], boolean)}. For every target token id they hold the
   * first and the last (inclusive) suffix-array index whose suffix starts
   * with that token, or -1 when the token never starts a target suffix.
   * 
   * TODO(spenceg) Lopez reports finding a few order=5 n-grams of high frequency
   * so maybe generalize this lookup.
   * 
   * @param sampleSize maximum number of sentences sampled per cached n-gram
   * @param minOccurrences a run is cached only if it is longer than this
   * @return a map from frequent source n-gram to its sample
   * @throws IllegalArgumentException if sampleSize is not smaller than minOccurrences
   */
  public Map<Span,SuffixArraySample> lookupFrequentSourceNgrams(int sampleSize, int minOccurrences) {
    if (sampleSize >= minOccurrences) {
      throw new IllegalArgumentException(String.format(
          "sampleSize (%d) must be smaller than minOccurrences (%d)", sampleSize, minOccurrences));
    }
    if (srcSuffixArray.length == 0) return Collections.emptyMap();
    logger.info("Building query cache with threshold {}", minOccurrences);
    Map<Span,SuffixArraySample> queryCache = new HashMap<>(1000);
    int nCnt = 1, nnCnt = 1, nnnCnt = 1;
    int nStart = 0, nnStart = 0, nnnStart = 0;
    Suffix firstSuffix = new Suffix(srcSuffixArray[0], true);
    Span nSpan = new Span(firstSuffix, 1), 
        nnSpan = new Span(firstSuffix, 2), 
        nnnSpan = new Span(firstSuffix, 3);
    for (int i = 1; i < srcSuffixArray.length; ++i) {
      Suffix suffix = new Suffix(srcSuffixArray[i], true);
      Span nSpanThis = new Span(suffix, 1);
      Span nnSpanThis = new Span(suffix, 2);
      Span nnnSpanThis = new Span(suffix, 3);
      // checkSpan returns 1 exactly when a new run starts at position i.
      nCnt = checkSpan(nSpan, nSpanThis, nStart, i, nCnt, minOccurrences, sampleSize, queryCache);
      if (nCnt == 1) {
        nStart = i;
        nSpan = nSpanThis;
      }
      nnCnt = checkSpan(nnSpan, nnSpanThis, nnStart, i, nnCnt, minOccurrences, sampleSize, queryCache);
      if (nnCnt == 1) {
        nnStart = i;
        nnSpan = nnSpanThis;
      }
      nnnCnt = checkSpan(nnnSpan, nnnSpanThis, nnnStart, i, nnnCnt, minOccurrences, sampleSize, queryCache);
      if (nnnCnt == 1) {
        nnnStart = i;
        nnnSpan = nnnSpanThis;
      }
    }
    // Close the runs that extend to the end of the suffix array. Passing null
    // as the next span never matches, so each run is evaluated against the threshold.
    final int saEnd = srcSuffixArray.length;
    checkSpan(nSpan, null, nStart, saEnd, nCnt, minOccurrences, sampleSize, queryCache);
    checkSpan(nnSpan, null, nnStart, saEnd, nnCnt, minOccurrences, sampleSize, queryCache);
    checkSpan(nnnSpan, null, nnnStart, saEnd, nnnCnt, minOccurrences, sampleSize, queryCache);
    logger.info("Query cache size: {}", queryCache.size());
    
    logger.info("Creating target unigram caches for the count() function...");
    this.tgtCountLBCache = new int[vocabulary.size()];
    Arrays.fill(tgtCountLBCache, -1);
    this.tgtCountUBCache = new int[vocabulary.size()];
    Arrays.fill(tgtCountUBCache, -1);
    
    // Nothing to index when the target side has no suffixes.
    if (tgtSuffixArray.length > 0) {
      int lastId = tgtBitext[tgtSuffixArray[0]];
      
      for (int i = 0; i < tgtSuffixArray.length; ++i) {
        int tgtId = tgtBitext[tgtSuffixArray[i]];
        assert tgtId >= 0;
        if (tgtCountLBCache[tgtId] < 0) {
          tgtCountLBCache[tgtId] = i;
        }
        if (lastId != tgtId) {
          // The previous token's block ended at the previous index (inclusive).
          tgtCountUBCache[lastId] = i-1;
          assert tgtCountUBCache[lastId] >= tgtCountLBCache[lastId] : String.format("%d %d %d", i, lastId, tgtId);
        }
        lastId = tgtId;
      }
      
      // final update: upper bounds are inclusive, so the last block ends at
      // the last valid index of the suffix array.
      tgtCountUBCache[lastId] = tgtSuffixArray.length - 1;
      assert tgtCountUBCache[lastId] >= tgtCountLBCache[lastId] : String.format("%d %d final", tgtSuffixArray.length, lastId);
    }
    
    logger.info("Finished building count() cache.");
    
    return queryCache;
  }
    
  /**
   * Advance the run counter for one n-gram order while scanning the source
   * suffix array.
   * <p>
   * If the next span continues the current run, the incremented count is
   * returned. Otherwise the run [startSa, endSa) has ended: when its count
   * exceeds the threshold, an evenly spaced sample of at most sampleSize
   * sentences is stored in the cache under the current span, and 1 is
   * returned as the count of the new run.
   * 
   * @param currentSpan span of the run in progress
   * @param nextSpan span at the current suffix-array position
   * @param startSa suffix-array index where the run began (inclusive)
   * @param endSa suffix-array index where the run ended (exclusive)
   * @param cnt number of suffixes seen in the run so far
   * @param ruleCacheThreshold the run is cached only if cnt exceeds this
   * @param sampleSize maximum number of sentences to sample
   * @param queryCache cache that receives the sample
   * @return the updated run count
   */
  private int checkSpan(Span currentSpan, Span nextSpan, int startSa, int endSa, int cnt, 
      int ruleCacheThreshold, int sampleSize, Map<Span, SuffixArraySample> queryCache) {
    final boolean continuesRun = currentSpan != null && currentSpan.equals(nextSpan);
    if (continuesRun) {
      return cnt + 1;
    }

    if (cnt > ruleCacheThreshold) {
      final int numHits = endSa - startSa;
      final int stepSize = (numHits >= sampleSize) ? numHits / sampleSize : 1;
      assert stepSize > 0;
      final List<SentencePair> hits = new ArrayList<>(sampleSize);
      int saIndex = startSa;
      while (saIndex < endSa && hits.size() < sampleSize) {
        final int corpusPosition = srcSuffixArray[saIndex];
        assert srcBitext[corpusPosition] >= 0;
        hits.add(new SentencePair(corpusPosition));
        saIndex += stepSize;
      }
      queryCache.put(currentSpan, new SuffixArraySample(hits, startSa, endSa-1));
    }

    // A new run begins with the span that broke the old one.
    return 1;
  }

  /**
   * Identifies a span for caching: the first {@code order} token ids of a
   * suffix. Equality and hashing are based on the token ids only.
   * 
   * @author Spence Green
   *
   */
  public class Span {
    public final int[] tokens;
    private final int hashCode;

    /**
     * Copy the first {@code order} tokens of the suffix. If the suffix ends
     * before that many tokens are available, the span is empty.
     */
    private Span(Suffix suffix, int order) {
      int[] ids = new int[order];
      int filled = 0;
      while (filled < order) {
        final int id = suffix.get(filled);
        if (id < 0) {
          break;
        }
        ids[filled] = id;
        ++filled;
      }
      if (filled < order) {
        // Suffix is shorter than the requested order
        ids = new int[0];
      }
      this.tokens = ids;
      this.hashCode = MurmurHash2.hash32(ids, ids.length, 1);
    }

    @Override
    public int hashCode() {
      return hashCode;
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (! (o instanceof Span)) {
        return false;
      }
      final Span that = (Span) o;
      return Arrays.equals(this.tokens, that.tokens);
    }

    @Override
    public String toString() {
      return Arrays.stream(tokens)
          .mapToObj(id -> vocabulary.get(id))
          .collect(Collectors.joining(" "));
    }
  }

  /**
   * The number of segments (sentence pairs) in the underlying corpus.
   * 
   * @return the sentence count recorded when the corpus was loaded
   */
  public int numSentences() { return numSentences; }

  /**
   * The number of entries in the source suffix array.
   * 
   * @return the length of the source suffix array
   */
  public int sourceSASize() { return srcSuffixArray.length; }
  
  /**
   * The number of entries in the target suffix array.
   * 
   * @return the length of the target suffix array
   */
  public int targetSASize() { return tgtSuffixArray.length; }
  
  /**
   * Find a lower or upper bound in the suffix array, searching from
   * {@code startFrom} through the last entry of the array.
   * 
   * @param query the token ids to look up
   * @param isSource search the source suffix array if true, else the target
   * @param lowerBound find the first match if true, else the last match
   * @param startFrom first suffix-array index to consider
   * @return the suffix-array index of the bound, or -1 if the query is not found
   */
  private int findBound(final int[] query, boolean isSource, boolean lowerBound, int startFrom) {
    final int[] suffixArray = isSource ? this.srcSuffixArray : this.tgtSuffixArray;
    final int lastIndex = suffixArray.length - 1;
    return findBound(query, isSource, lowerBound, startFrom, lastIndex);
  }
  
  /**
   * Binary search for the first or the last suffix-array entry that has
   * {@code query} as a prefix, restricted to the index range [lo, hi].
   * <p>
   * Note that the comparison value is "query versus suffix": a negative value
   * means the query sorts before the probed suffix.
   * <p>
   * NOTE(review): the neighbour check reads outside [lo, hi]; if the caller
   * passes a range that cuts through the block of matches, the search can
   * end with -1 even though matches exist. Callers appear to pass ranges that
   * enclose the whole block -- confirm.
   * 
   * @param query the token ids to look up
   * @param isSource search the source suffix array if true, else the target
   * @param lowerBound find the first match if true, else the last match
   * @param lo first suffix-array index to consider (inclusive)
   * @param hi last suffix-array index to consider (inclusive)
   * @return the suffix-array index of the bound, or -1 if the query is not found
   */
  private int findBound(final int[] query, boolean isSource, boolean lowerBound, int lo, int hi) {
    int[] sa = isSource ? this.srcSuffixArray : this.tgtSuffixArray;
    int low = lo;
    int high = hi;
    while(low <= high) {
      // Unsigned shift keeps the midpoint valid even if low + high overflows.
      final int mid = (low + high) >>> 1;
      assert mid < sa.length;
      final int corpusPos = sa[mid];
      assert corpusPos >= 0;
      final Suffix midSuffix = new Suffix(corpusPos, isSource);
      final int cmp = midSuffix.compare(query);

      if (cmp < 0) {
        // Search left
        // (the query sorts before the suffix at mid)
        high = mid - 1;

      } else if (cmp > 0) {
        // Search right
        // (the query sorts after the suffix at mid)
        low = mid + 1;

      } else {
        // Check to see if this is the bound, then search
        if (lowerBound) {
          // mid is the first match if it is the first entry overall or if
          // its left neighbour does not match.
          if (mid == 0) return 0;
          Suffix leftSuffix = new Suffix(sa[mid-1], isSource);
          int cmp2 = leftSuffix.compare(query);
          if (cmp2 > 0) return mid;
          // Search left
          assert cmp2 == 0;
          high = mid - 1;

        } else {
          // mid is the last match if it is the last entry overall or if
          // its right neighbour does not match.
          if (mid == sa.length - 1) return mid;
          Suffix rightSuffix = new Suffix(sa[mid+1], isSource);
          int cmp2 = rightSuffix.compare(query);
          if (cmp2 < 0) return mid;
          // Search right
          assert cmp2 == 0;
          low = mid + 1;
        }
      }
    }
    // Key not found
    return -1;
  }

  /**
   * Wrapper object for suffix queries: a view of the suffix that starts at a
   * fixed corpus position of the source or target bitext and ends at the
   * next sentence boundary.
   * 
   * @author Spence Green
   *
   */
  private class Suffix {
    private final int pos;
    private final boolean isSource;

    public Suffix(int corpusPosition, boolean isSource) {
      this.pos = corpusPosition;
      this.isSource = isSource;
    }
    
    /**
     * Token id at offset i of this suffix, or -1 when that position lies
     * outside the bitext or holds a sentence boundary marker.
     */
    public int get(int i) {
      final int[] text = isSource ? srcBitext : tgtBitext;
      final int index = this.pos + i;
      final boolean inRange = index >= 0 && index < text.length;
      if (inRange && text[index] >= 0) {
        return text[index];
      }
      return -1;
    }

    /**
     * Compare a query against this suffix.
     * 
     * @return 0 if the query is a prefix of this suffix; otherwise the string
     *         comparison of the first differing pair of words (query word
     *         versus suffix word), or 1 if the suffix ends before the query
     *         is exhausted (this includes the empty query)
     */
    public int compare(int[] query) {
      final int[] text = isSource ? srcBitext : tgtBitext;
      int matched = 0;
      while (matched < query.length && text[pos + matched] >= 0) {
        final int queryId = query[matched];
        final int textId = text[pos + matched];
        if (queryId != textId) {
          return vocabulary.get(queryId).compareTo(vocabulary.get(textId));
        }
        ++matched;
      }

      // Every query token was matched: the query is a prefix of this suffix.
      // Otherwise the suffix ran out first and the query counts as larger.
      final boolean consumedQuery = query.length > 0 && matched == query.length;
      if (consumedQuery) {
        return 0;
      }
      return 1;
    }

    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder();
      for (int i = 0; ; ++i) {
        final int vocabId = get(i);
        if (vocabId < 0) {
          break;
        }
        if (i > 0) sb.append(" ");
        sb.append(vocabulary.get(vocabId));
      }
      return sb.toString();
    }
  }

  /**
   * Count of this sequence in either the source or target bitext.
   * <p>
   * For target-side queries the unigram caches are used when they have been
   * built: they give the inclusive suffix-array range of all suffixes that
   * start with the first query token. A first token that never starts a
   * target suffix (cache entry -1) or that lies outside the cache yields a
   * count of 0.
   * 
   * @param query the token ids to look up
   * @param isSource count in the source bitext if true, else in the target
   * @return the number of occurrences; 0 for an empty or unseen query
   */
  public int count(final int[] query, boolean isSource) {
    if (query.length == 0) return 0;
    if (!isSource && this.tgtCountLBCache != null && this.tgtCountUBCache != null) {
      // Use caches for fast target lookup
      final int tgtId = query[0];
      // Ids the cache does not cover cannot occur in the target bitext.
      if (tgtId < 0 || tgtId >= tgtCountLBCache.length) return 0;
      final int lo = tgtCountLBCache[tgtId];
      // Defensive clamp: never search past the last suffix-array entry.
      final int hi = Math.min(tgtCountUBCache[tgtId], tgtSuffixArray.length - 1);
      // -1 marks a token that never starts a target suffix.
      if (lo < 0 || hi < lo) return 0;
      if (query.length == 1) {
        int count = hi - lo + 1;
        assert count > 0 : String.format("%d %d %d %d", tgtId, count, lo, hi);
        return count;
        
      } else {
        int lb = findBound(query, isSource, true, lo);
        if (lb >= 0) {
          int ub = findBound(query, isSource, false, lb, hi);
          assert ub >= 0 : String.format("%d %d %d %d %d", tgtId, lo, hi, lb, ub);
          return ub - lb + 1;
        }
      }
      
    } else {
      // Standard case
      int lb = findBound(query, isSource, true, 0);
      if (lb >= 0) {
        int ub = findBound(query, isSource, false, lb);
        assert ub >= 0;
        return ub - lb + 1;
      }
    }
    return 0;
  }
  
  /**
   * Return a sample of sentences from this suffix array. The whole source
   * suffix array is searched and no exact-match filtering is applied.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples maximum number of sentence pairs to return
   * @return the sample together with the suffix-array bounds of the match
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples) {
    // minBound 0 with maxBound -1 selects the unbounded search.
    return sample(sourceQuery, maxSamples, 0, -1);
  }
  
  /**
   * Return a sample of sentences from this suffix array. The whole source
   * suffix array is searched.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples maximum number of sentence pairs to return
   * @param exactMatch if true, keep only sentence pairs whose source length
   *        equals the query length
   * @return the sample together with the suffix-array bounds of the match
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, boolean exactMatch) {
    // minBound 0 with maxBound -1 selects the unbounded search.
    return sample(sourceQuery, maxSamples, 0, -1, exactMatch);
  }


  /**
   * Return a sample of sentences from the suffix array, optionally limiting
   * the search to a known range of the source suffix array. No exact-match
   * filtering is applied.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples maximum number of sentence pairs to return
   * @param minBound first suffix-array index to search
   * @param maxBound last suffix-array index to search; ignored unless it is
   *        greater than minBound
   * @return the sample together with the suffix-array bounds of the match
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, int minBound, int maxBound) {
    return sample(sourceQuery, maxSamples, minBound, maxBound, false);
  }
  
  /**
   * Return a sample of sentences from the suffix array.
   * <p>
   * The range of suffixes that start with the query is located first; the
   * range is then visited with a fixed stride so that at most maxSamples
   * sentence pairs are collected. An empty query or a query that does not
   * occur yields an empty sample with bounds -1.
   * 
   * @param sourceQuery the source token ids to look up
   * @param maxSamples maximum number of sentence pairs to return
   * @param minBound first suffix-array index to search
   * @param maxBound last suffix-array index to search; ignored unless it is
   *        greater than minBound
   * @param exactMatch if true, keep only sentence pairs whose source length
   *        equals the query length
   * @return the sample together with the suffix-array bounds of the match
   */
  public SuffixArraySample sample(final int[] sourceQuery, int maxSamples, int minBound, int maxBound, boolean exactMatch) {
    if (sourceQuery.length == 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }

    // Lower bound: honor the caller's range only if it is non-degenerate
    final int lb;
    if (maxBound > minBound) {
      lb = findBound(sourceQuery, true, true, minBound, maxBound);
    } else {
      lb = findBound(sourceQuery, true, true, minBound);
    }
    if (lb < 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }

    // Upper bound: search from the lower bound onwards
    final int ub;
    if (maxBound > lb) {
      ub = findBound(sourceQuery, true, false, lb, maxBound);
    } else {
      ub = findBound(sourceQuery, true, false, lb);
    }
    assert ub >= 0;

    final int numHits = ub - lb + 1;
    final int stepSize = (numHits >= maxSamples) ? numHits / maxSamples : 1;
    assert stepSize > 0;

    // Stratified sample through the list of positions
    final List<SentencePair> samples = new ArrayList<>(maxSamples);
    int saIndex = lb;
    while (saIndex <= ub && samples.size() < maxSamples) {
      final SentencePair candidate = new SentencePair(srcSuffixArray[saIndex]);
      if (!exactMatch || candidate.sourceLength() == sourceQuery.length) {
        samples.add(candidate);
      }
      saIndex += stepSize;
    }
    return new SuffixArraySample(samples, lb, ub);
  }
  
  /**
   * Return a sample from the target-side. Optimizations for pre-initializing the search
   * are not supported, so the whole target suffix array is always searched.
   * An empty query or a query that does not occur yields an empty sample
   * with bounds -1.
   * 
   * @param targetQuery the target token ids to look up
   * @param maxSamples maximum number of sentence pairs to return
   * @return the sample together with the suffix-array bounds of the match
   */
  public SuffixArraySample sampleTarget(final int[] targetQuery, int maxSamples) {
    if (targetQuery.length == 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }
    final int lb = findBound(targetQuery, false, true, 0);
    if (lb < 0) {
      return new SuffixArraySample(Collections.emptyList(), -1, -1);
    }
    final int ub = findBound(targetQuery, false, false, lb);
    assert ub >= 0;

    final int numHits = ub - lb + 1;
    final int stepSize = (numHits >= maxSamples) ? numHits / maxSamples : 1;
    assert stepSize > 0;

    // Stratified sample through the list of positions
    final List<SentencePair> samples = new ArrayList<>(maxSamples);
    int saIndex = lb;
    while (saIndex <= ub && samples.size() < maxSamples) {
      samples.add(new SentencePair(tgtSuffixArray[saIndex], true));
      saIndex += stepSize;
    }
    return new SuffixArraySample(samples, lb, ub);
  }

  /**
   * A sampled sentence with an associated pointer to the left edge of
   * the query sequence.
   * <p>
   * An instance stores only the source and target spans of the sentence inside
   * the enclosing object's bitext arrays. Every accessor takes a sentence-relative
   * index, bounds-checks it against the sentence, and then reads the underlying array.
   * 
   * @author Spence Green
   *
   */
  public class SentencePair {
    
    /**
     * Offset of the sampled corpus position from the start of its sentence. It is
     * relative to the source sentence when the pair was created from a source-side
     * position, and to the target sentence when created from a target-side position.
     */
    public final int wordPosition;
    
    // TODO(spenceg) The character offset would yield a sentence id for e.g., bitext tuning.
//    public final int sentenceId;
    
    public final int srcStartInclusive;
    private final int srcEndExclusive;
    private final int tgtStartInclusive;
    private final int tgtEndExclusive;
    
    /**
     * Create a sentence pair from a position in the source bitext.
     * 
     * @param corpusPosition a position in the source bitext that holds a
     *        non-negative value
     */
    private SentencePair(int corpusPosition) {
      // Find source span
      int j = corpusPosition;
      assert srcBitext[j] >= 0;
      // Walk forward to the first negative entry, which ends the sentence
      while (srcBitext[j] >= 0) j++;
      srcEndExclusive = j;
      // Walk backward to the negative entry before the sentence, or off the array
      j = corpusPosition - 1;
      while (j >= 0 && srcBitext[j] >= 0) j--;
      srcStartInclusive = j + 1;
      assert corpusPosition >= srcStartInclusive : String.format("%d %d", corpusPosition, srcStartInclusive);
      
      // Find the target span from the negative entries that delimit the source sentence
      tgtStartInclusive = j == -1 ? 0 : fromSentenceOffset(srcBitext[j]) + 1;
      tgtEndExclusive = fromSentenceOffset(srcBitext[srcEndExclusive]);
      assert tgtStartInclusive < tgtEndExclusive : String.format("tgt: %d %d", tgtStartInclusive, 
          tgtEndExclusive);
      assert tgtEndExclusive > 0 : String.valueOf(tgtEndExclusive);
      assert fromSentenceOffset(tgtBitext[tgtEndExclusive]) == srcEndExclusive : String.format("%d %d", 
          fromSentenceOffset(tgtBitext[tgtEndExclusive]), srcEndExclusive);
      
      // Set the start of the query
      wordPosition = corpusPosition - srcStartInclusive;
    }
    
    /**
     * Create a sentence pair from a position in the target bitext.
     * 
     * @param corpusPosition a position in the target bitext that holds a
     *        non-negative value
     * @param isTarget unused; it only distinguishes this constructor from the
     *        source-side one, and the pair is always built from the target side
     */
    private SentencePair(int corpusPosition, boolean isTarget) {
      // Find target span
      int j = corpusPosition;
      assert tgtBitext[j] >= 0;
      // Walk forward to the first negative entry, which ends the sentence
      while (tgtBitext[j] >= 0) j++;
      tgtEndExclusive = j;
      // Walk backward to the negative entry before the sentence, or off the array
      j = corpusPosition - 1;
      while (j >= 0 && tgtBitext[j] >= 0) j--;
      tgtStartInclusive = j + 1;
      assert corpusPosition >= tgtStartInclusive : String.format("%d %d", corpusPosition, tgtStartInclusive);
      
      // Find the source span from the negative entries that delimit the target sentence
      srcStartInclusive = j == -1 ? 0 : fromSentenceOffset(tgtBitext[j]) + 1;
      srcEndExclusive = fromSentenceOffset(tgtBitext[tgtEndExclusive]);
      assert srcStartInclusive < srcEndExclusive : String.format("src: %d %d", srcStartInclusive, 
          srcEndExclusive);
      assert srcEndExclusive > 0 : String.valueOf(srcEndExclusive);
      assert fromSentenceOffset(srcBitext[srcEndExclusive]) == tgtEndExclusive : String.format("%d %d", 
          fromSentenceOffset(srcBitext[srcEndExclusive]), tgtEndExclusive);
      
      // Set the start of the query
      wordPosition = corpusPosition - tgtStartInclusive;
    }
    
    /**
     * @return the number of positions in the source span
     */
    public int sourceLength() {
      return srcEndExclusive - srcStartInclusive;
    }
    
    /**
     * @return the number of positions in the target span
     */
    public int targetLength() {
      return tgtEndExclusive - tgtStartInclusive;
    }
    
    /**
     * Get the source bitext value at a sentence-relative index.
     * 
     * @param i index relative to the start of the source sentence
     * @return the entry of the source bitext at that position
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the source sentence
     */
    public int source(int i) {
      return srcBitext[sourceBitextPosition(i)];
    }
    
    /**
     * Get the target bitext value at a sentence-relative index.
     * 
     * @param i index relative to the start of the target sentence
     * @return the entry of the target bitext at that position
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the target sentence
     */
    public int target(int i) {
      return tgtBitext[targetBitextPosition(i)];
    }
    
    /**
     * Copy the raw source-to-target alignment entries for a span of the source sentence.
     * 
     * @param startInclusive first index, relative to the start of the source sentence
     * @param endExclusive index after the last one, relative to the start of the source sentence
     * @return a copy of the corresponding entries of the {@code f2e} array
     * @throws IllegalArgumentException if the span is empty or inverted
     * @throws ArrayIndexOutOfBoundsException if the span is not inside the source sentence
     */
    public int[] f2e(int startInclusive, int endExclusive) {
      checkSpan(startInclusive, endExclusive, sourceLength());
      return Arrays.copyOfRange(f2e, srcStartInclusive + startInclusive, srcStartInclusive + endExclusive);
    }
    
    /**
     * Get the expanded source-to-target alignment entry for one source position.
     * 
     * @param i index relative to the start of the source sentence
     * @return the result of {@code AlignedSentence.expand} on the {@code f2e} entry
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the source sentence
     */
    public int[] f2e(int i) {
      return AlignedSentence.expand(f2e[sourceBitextPosition(i)]);
    }
    
    /**
     * Copy the raw target-to-source alignment entries for a span of the target sentence.
     * 
     * @param startInclusive first index, relative to the start of the target sentence
     * @param endExclusive index after the last one, relative to the start of the target sentence
     * @return a copy of the corresponding entries of the {@code e2f} array
     * @throws IllegalArgumentException if the span is empty or inverted
     * @throws ArrayIndexOutOfBoundsException if the span is not inside the target sentence
     */
    public int[] e2f(int startInclusive, int endExclusive) {
      checkSpan(startInclusive, endExclusive, targetLength());
      return Arrays.copyOfRange(e2f, tgtStartInclusive + startInclusive, tgtStartInclusive + endExclusive);
    }
    
    /**
     * Get the expanded target-to-source alignment entry for one target position.
     * 
     * @param i index relative to the start of the target sentence
     * @return the result of {@code AlignedSentence.expand} on the {@code e2f} entry
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the target sentence
     */
    public int[] e2f(int i) {
      return AlignedSentence.expand(e2f[targetBitextPosition(i)]);
    }
    
    /**
     * @param i index relative to the start of the source sentence
     * @return true if the {@code f2e} entry at that position is 0
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the source sentence
     */
    public boolean isSourceUnaligned(int i) {
      return f2e[sourceBitextPosition(i)] == 0;
    }
    
    /**
     * @param i index relative to the start of the target sentence
     * @return true if the {@code e2f} entry at that position is 0
     * @throws ArrayIndexOutOfBoundsException if {@code i} is outside the target sentence
     */
    public boolean isTargetUnaligned(int i) {
      return e2f[targetBitextPosition(i)] == 0;
    }
    
    /**
     * @return a new {@code ParallelSuffixArrayEntry} built from this pair and the
     *         enclosing object's vocabulary
     */
    public ParallelSuffixArrayEntry getParallelEntry() {
      return new ParallelSuffixArrayEntry(this, vocabulary);
    }
    
    @Override
    public String toString() {
      return this.getParallelEntry().toString();
    }
    
    /**
     * Translate a source sentence index into a source bitext position.
     * The lower bound is the start of this sentence, not the start of the bitext,
     * so a negative index can never reach into a preceding sentence. An index that
     * overflows the addition wraps to a negative position and is rejected as well.
     */
    private int sourceBitextPosition(int i) {
      int bitextPos = srcStartInclusive + i;
      if (bitextPos < srcStartInclusive || bitextPos >= srcEndExclusive) throw new ArrayIndexOutOfBoundsException(i);
      return bitextPos;
    }
    
    /**
     * Translate a target sentence index into a target bitext position,
     * with the same checks as for the source side.
     */
    private int targetBitextPosition(int i) {
      int bitextPos = tgtStartInclusive + i;
      if (bitextPos < tgtStartInclusive || bitextPos >= tgtEndExclusive) throw new ArrayIndexOutOfBoundsException(i);
      return bitextPos;
    }
    
    /**
     * Validate a sentence-relative span against a sentence length. The comparison is
     * done on the relative indices so that a very large end index cannot overflow.
     */
    private void checkSpan(int startInclusive, int endExclusive, int length) {
      if (startInclusive >= endExclusive) throw new IllegalArgumentException(
          String.format("Empty or inverted span: [%d, %d)", startInclusive, endExclusive));
      if (startInclusive < 0 || endExclusive > length) throw new ArrayIndexOutOfBoundsException(
          String.format("Span [%d, %d) is outside of [0, %d)", startInclusive, endExclusive, length));
    }
  }
  
  /**
   * Plain holder for the outcome of sampling a suffix array: the sampled
   * sentence pairs plus the two bounds reported alongside them.
   * 
   * @author Spence Green
   *
   */
  public static class SuffixArraySample {
    /** The sampled sentence pairs, stored exactly as passed to the constructor. */
    public final List<SentencePair> samples;
    /** Lower bound associated with the sample. */
    public final int lb;
    /** Upper bound associated with the sample. */
    public final int ub;

    /**
     * @param q the sampled sentence pairs; the list is stored without copying
     * @param lb the lower bound to record
     * @param ub the upper bound to record
     */
    public SuffixArraySample(List<SentencePair> q, int lb, int ub) {
      this.lb = lb;
      this.ub = ub;
      this.samples = q;
    }

    /**
     * @return the number of sampled sentence pairs
     */
    public int size() {
      return samples.size();
    }

    @Override
    public String toString() {
      final int sampleCount = samples.size();
      return String.format("bounds: %d/%d size: %d", lb, ub, sampleCount);
    }
  }
}