import signal
import time
import os
import copy
import json
import tempfile
import pickle
import itertools
import sys
import multiprocessing
import pyfastaq
import minimap_ariba
from ariba import cluster, common, histogram, mlst_reporter, read_store, report, report_filter, reference_data, tb


class Error (Exception): pass


# Shared objects (remaining_clusters, remaining_clusters_lock) are passed through
# here and thus made explicit arguments to Pool.starmap when running this function.
# That seems to be a recommended safe transfer mechanism as opposed to making them
# attributes of a pre-constructed 'obj' variable (although the docs are a bit hazy
# on that).
def _run_cluster(obj, verbose, clean, fails_dir, remaining_clusters, remaining_clusters_lock):
    """Run one cluster.Cluster object and return it.

    Module-level (not a method) so it can be pickled for Pool.starmap.
    If any other cluster has already failed (i.e. fails_dir is non-empty),
    the cluster is not started at all. A failure of this cluster is recorded
    by touching a file named after the cluster inside fails_dir; the
    exception itself is swallowed so the pool keeps running."""
    failed_clusters = os.listdir(fails_dir)

    if len(failed_clusters) > 0:
        print('Other clusters failed. Will not start cluster', obj.name, file=sys.stderr)
        return obj

    if verbose:
        print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    try:
        obj.run(remaining_clusters=remaining_clusters,remaining_clusters_lock=remaining_clusters_lock)
    except:
        # Record the failure as a marker file; other workers check fails_dir
        # before starting. NOTE(review): bare except also swallows
        # KeyboardInterrupt/SystemExit inside a worker.
        print('Failed cluster:', obj.name, file=sys.stderr)
        with open(os.path.join(fails_dir, obj.name), 'w'):
            pass

    if verbose:
        print('Finished running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    if clean:
        if verbose:
            print('Deleting cluster dir', obj.root_dir, flush=True)
        if os.path.exists(obj.root_dir):
            # Best-effort deletion; failure to remove the directory is not fatal
            try:
                common.rmtree(obj.root_dir)
            except:
                pass

    return obj


class Clusters:
    """Top-level driver of an ARIBA run: maps reads to the clustered reference
    sequences with minimap, builds one cluster.Cluster per mapped cluster, runs
    the clusters (optionally in a multiprocessing.Pool), and writes the final
    reports and catted FASTA files."""

    def __init__(self,
      refdata_dir,
      reads_1,
      reads_2,
      outdir,
      extern_progs,
      version_report_lines=None,
      assembly_kmer=21,
      assembly_coverage=100,
      threads=1,
      verbose=False,
      assembler='fermilite',
      spades_mode='rna',
      spades_options=None,
      max_insert=1000,
      min_scaff_depth=10,
      nucmer_min_id=90,
      nucmer_min_len=20,
      nucmer_breaklen=200,
      assembled_threshold=0.95,
      unique_threshold=0.03,
      max_gene_nt_extend=30,
      clean=True,
      tmp_dir=None,
    ):
        """Set up all file paths, load the prepared reference data from
        refdata_dir, create the output/log/fails directories, choose the
        temporary directory (honouring ARIBA_TMPDIR, then TMPDIR), and install
        a signal handler for emergency shutdown.

        Raises Error if any output directory cannot be made or the chosen
        temporary directory does not exist."""
        self.refdata_dir = os.path.abspath(refdata_dir)
        self.refdata, self.cluster_ids = self._load_reference_data_from_dir(refdata_dir)
        self.reads_1 = os.path.abspath(reads_1)
        self.reads_2 = os.path.abspath(reads_2)
        self.outdir = os.path.abspath(outdir)
        self.extern_progs = extern_progs
        self.clusters_tsv = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.clusters.tsv'))
        self.all_ref_seqs_fasta = os.path.abspath(os.path.join(refdata_dir, '02.cdhit.all.fa'))

        if version_report_lines is None:
            self.version_report_lines = []
        else:
            self.version_report_lines = version_report_lines

        self.clean = clean
        self.logs_dir = os.path.join(self.outdir, 'Logs')

        # Assembly options passed through to each cluster.Cluster
        self.assembler = assembler
        self.assembly_kmer = assembly_kmer
        self.assembly_coverage = assembly_coverage
        self.spades_mode = spades_mode
        self.spades_options = spades_options

        self.cdhit_files_prefix = os.path.join(self.refdata_dir, 'cdhit')
        self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'

        # Output file names
        self.bam_prefix = os.path.join(self.outdir, 'map_reads_to_cluster_reps')
        self.bam = self.bam_prefix + '.bam'
        self.report_file_all_tsv = os.path.join(self.outdir, 'debug.report.tsv')
        self.report_file_filtered = os.path.join(self.outdir, 'report.tsv')
        self.mlst_reports_prefix = os.path.join(self.outdir, 'mlst_report')
        self.mlst_profile_file = os.path.join(self.refdata_dir, 'pubmlst.profile.txt')
        self.tb_resistance_calls_file = os.path.join(self.outdir, 'tb.resistance.json')
        self.catted_assembled_seqs_fasta = os.path.join(self.outdir, 'assembled_seqs.fa.gz')
        self.catted_genes_matching_refs_fasta = os.path.join(self.outdir, 'assembled_genes.fa.gz')
        self.catted_assemblies_fasta = os.path.join(self.outdir, 'assemblies.fa.gz')
        self.threads = threads
        self.verbose = verbose

        # Insert-size statistics; filled in later by _set_insert_size_data()
        self.max_insert = max_insert
        self.insert_hist_bin = 10
        self.insert_hist = histogram.Histogram(self.insert_hist_bin)
        self.insert_size = None
        self.insert_sspace_sd = None
        self.insert_proper_pair_max = None

        self.min_scaff_depth = min_scaff_depth
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen
        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold
        self.max_gene_nt_extend = max_gene_nt_extend

        self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
        self.clusters = {}        # gene name -> Cluster object
        self.cluster_read_counts = {} # gene name -> number of reads
        self.cluster_base_counts = {} # gene name -> number of bases
        self.pool = None  # multiprocessing.Pool, created only when threads > 1

        # Workers touch a marker file per failed cluster in here; an empty
        # directory at the end means all clusters ran ok.
        self.fails_dir = os.path.join(self.outdir, '.fails')
        self.clusters_all_ran_ok = True

        for d in [self.outdir, self.logs_dir, self.fails_dir]:
            try:
                os.mkdir(d)
            except:
                raise Error('Error mkdir ' + d)

        # Temporary directory preference: explicit argument, then the
        # ARIBA_TMPDIR / TMPDIR environment variables, then outdir itself.
        if tmp_dir is None:
            if 'ARIBA_TMPDIR' in os.environ:
                tmp_dir = os.path.abspath(os.environ['ARIBA_TMPDIR'])
            elif 'TMPDIR' in os.environ:
                tmp_dir = os.path.abspath(os.environ['TMPDIR'])
            else:
                tmp_dir = self.outdir

        if not os.path.exists(tmp_dir):
            raise Error('Temporary directory ' + tmp_dir + ' not found. Cannot continue')

        if self.clean:
            # TemporaryDirectory cleans itself up; used only in clean mode
            self.tmp_dir_obj = tempfile.TemporaryDirectory(prefix='ariba.tmp.', dir=os.path.abspath(tmp_dir))
            self.tmp_dir = self.tmp_dir_obj.name
        else:
            # --noclean: keep cluster directories under outdir for inspection
            self.tmp_dir_obj = None
            self.tmp_dir = os.path.join(self.outdir, 'clusters')
            try:
                os.mkdir(self.tmp_dir)
            except:
                raise Error('Error making directory ' + self.tmp_dir)

        if self.verbose:
            print('Temporary directory:', self.tmp_dir)

        # Install the emergency-stop handler for (almost) every signal, so a
        # killed run still tears down the pool and temp dir. Signals that are
        # part of normal child/terminal management are excluded. Some signals
        # cannot be caught, hence the try/except around each one.
        for i in [x for x in dir(signal) if x.startswith("SIG") and x not in {'SIGCHLD', 'SIGCLD', 'SIGPIPE', 'SIGTSTP', 'SIGCONT'}]:
            try:
                signum = getattr(signal, i)
                signal.signal(signum, self._receive_signal)
            except:
                pass


    def _stop_pool(self):
        """Shut down the multiprocessing pool (if any) and wait for all
        child processes to finish."""
        if self.pool is None:
            return

        self.pool.close()
        self.pool.terminate()
        # Poll until every child process has gone away
        while len(multiprocessing.active_children()) > 0:
            time.sleep(1)


    def _emergency_stop(self):
        """Best-effort teardown used by the signal handler and by run():
        stop the worker pool and remove the temporary directory."""
        self._stop_pool()
        if self.clean:
            try:
                self.tmp_dir_obj.cleanup()
            except:
                pass


    def _receive_signal(self, signum, stack):
        """Signal handler: report the signal, tear everything down, exit 1."""
        print('Stopping! Signal received:', signum, file=sys.stderr, flush=True)
        self._emergency_stop()
        sys.exit(1)


    @classmethod
    def _load_reference_data_info_file(cls, filename):
        """Parse the key<TAB>value info file written by the prepareref stage.

        Only keys already present in the expected-keys dict are read
        (currently just 'genetic_code', converted to int). Raises Error if
        any expected key is missing from the file."""
        data = {
            'genetic_code': None
        }

        with open(filename) as f:
            for line in f:
                key, val = line.rstrip().split('\t')
                if key in data:
                    data[key] = val

        if None in data.values():
            missing_values = [x for x in data if data[x] is None]
            raise Error('Error reading reference info file ' + filename + '. These values not found: ' + ','.join(missing_values))

        data['genetic_code'] = int(data['genetic_code'])
        return data


    @staticmethod
    def _load_reference_data_from_dir(indir):
        """Load the prepared reference data written by ariba prepareref.

        Returns a tuple (reference_data.ReferenceData, cluster_ids dict
        unpickled from 02.cdhit.clusters.pickle). Raises Error if indir
        does not exist."""
        if not os.path.exists(indir):
            raise Error('Error loading reference data. Input directory ' + indir + ' not found. Cannot continue')

        fasta_file = os.path.join(indir, '02.cdhit.all.fa')
        metadata_file = os.path.join(indir, '01.filter.check_metadata.tsv')
        info_file = os.path.join(indir, '00.info.txt')
        parameters_file = os.path.join(indir, '00.params.json')
        clusters_pickle_file = os.path.join(indir, '02.cdhit.clusters.pickle')
        params = Clusters._load_reference_data_info_file(info_file)
        refdata = reference_data.ReferenceData(
            [fasta_file],
            [metadata_file],
            genetic_code=params['genetic_code'],
            parameters_file=parameters_file,
        )

        with open(clusters_pickle_file, 'rb') as f:
            cluster_ids = pickle.load(f)

        return refdata, cluster_ids


    def _map_and_cluster_reads(self):
        """Map the read pairs to all reference sequences with minimap and load
        the resulting per-cluster read/base counts, insert-size histogram and
        proper-pair count. Also builds the read store from minimap's reads
        file and (in clean mode) deletes the intermediate minimap outputs."""
        if self.verbose:
            print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)

        minimap_prefix = 'minimap'
        self._minimap_reads_to_all_ref_seqs(
            self.clusters_tsv,
            self.all_ref_seqs_fasta,
            self.reads_1,
            self.reads_2,
            minimap_prefix,
            verbose=self.verbose
        )

        if self.verbose:
            print('Finished mapping\n')
            print('{:_^79}'.format(' Generating clusters '), flush=True)

        self.cluster_to_rep, self.cluster_read_counts, self.cluster_base_counts, self.insert_hist, self.proper_pairs = self._load_minimap_files(minimap_prefix, self.insert_hist_bin)
        self.cluster_to_dir = {x: os.path.join(self.tmp_dir, x) for x in self.cluster_to_rep}
        reads_file_for_read_store = minimap_prefix + '.reads'

        if len(self.cluster_read_counts):
            if self.verbose:
                filehandle = sys.stdout
            else:
                filehandle = None
            self.read_store = read_store.ReadStore(
                reads_file_for_read_store,
                os.path.join(self.outdir, 'read_store'),
                log_fh=filehandle
            )
        # NOTE(review): reconstructed indentation — the reads file is removed
        # unconditionally here (minimap writes it regardless of whether any
        # cluster got reads); confirm against upstream if anything changes.
        os.unlink(reads_file_for_read_store)

        if self.clean:
            # Best-effort removal of the remaining minimap intermediate files
            for suffix in ['cluster2representative', 'clusterCounts', 'insertHistogram', 'properPairs']:
                filename = minimap_prefix + '.' + suffix
                try:
                    os.unlink(filename)
                except:
                    pass

        if self.verbose:
            print('Found', self.proper_pairs, 'proper read pairs from minimap')
            print('Total clusters to perform local assemblies:', len(self.cluster_to_dir), flush=True)


    @staticmethod
    def _minimap_reads_to_all_ref_seqs(clusters_tsv, ref_fasta, reads_1, reads_2, outprefix, verbose=False):
        """Run the minimap_ariba extension on the read pair against all
        reference sequences, writing its output files with the given prefix.
        Raises Error on a nonzero return code."""
        got = minimap_ariba.minimap_ariba(clusters_tsv, ref_fasta, reads_1, reads_2, outprefix)
        if (got != 0):
            raise Error('Error running minimap. Cannot continue')


    @classmethod
    def _load_minimap_out_cluster2representative(cls, infile):
        """Load the cluster<TAB>representative file into a dict
        cluster name -> representative sequence name."""
        cluster2rep = {}
        with open(infile) as f:
            for line in f:
                cluster, rep = line.rstrip().split('\t')
                cluster2rep[cluster] = rep
        return cluster2rep


    @classmethod
    def _load_minimap_out_cluster_counts(cls, infile):
        """Load the cluster<TAB>reads<TAB>bases counts file. Returns two
        dicts: cluster name -> read count, cluster name -> base count."""
        reads = {}
        bases = {}
        with open(infile) as f:
            for line in f:
                cluster, read, base = line.rstrip().split('\t')
                reads[cluster] = int(read)
                bases[cluster] = int(base)
        return reads, bases


    @classmethod
    def _load_minimap_insert_histogram(cls, infile, bin_size):
        """Load the value<TAB>count insert-size file into a
        histogram.Histogram with the given bin size."""
        hist = histogram.Histogram(bin_size)
        with open(infile) as f:
            for line in f:
                value, count = line.rstrip().split('\t')
                hist.add(int(value), count=int(count))
        return hist


    @classmethod
    def _load_minimap_proper_pairs(cls, infile):
        """Read the proper-pair count: the integer on the first line of the
        file."""
        with open(infile) as f:
            for line in f:
                pairs = int(line.rstrip())
                break
        return pairs


    @staticmethod
    def _load_minimap_files(inprefix, hist_bin_size):
        """Load all four minimap output files (inprefix + fixed suffixes).
        Returns (cluster2rep, cluster_read_count, cluster_base_count,
        insert_hist, proper_pairs)."""
        cluster2rep = Clusters._load_minimap_out_cluster2representative(inprefix + '.cluster2representative')
        cluster_read_count, cluster_base_count = Clusters._load_minimap_out_cluster_counts(inprefix + '.clusterCounts')
        insert_hist = Clusters._load_minimap_insert_histogram(inprefix + '.insertHistogram', hist_bin_size)
        proper_pairs = Clusters._load_minimap_proper_pairs(inprefix + '.properPairs')
        return cluster2rep, cluster_read_count, cluster_base_count, insert_hist, proper_pairs


    def _set_insert_size_data(self):
        """Derive insert-size statistics from the mapping histogram.

        Sets self.insert_size, self.insert_sspace_sd (floored via min(1, sd))
        and self.insert_proper_pair_max (1.1 * the 95th percentile).
        Returns False if the histogram is empty (nothing mapped), else True."""
        if len(self.insert_hist) == 0:
            return False
        else:
            (x, self.insert_size, pc95, self.insert_sspace_sd) = self.insert_hist.stats()
            self.insert_sspace_sd = min(1, self.insert_sspace_sd)
            self.insert_proper_pair_max = 1.1 * pc95
            if self.verbose:
                print('\nInsert size information from reads mapped to reference genes:')
                print('Insert size:', self.insert_size, sep='\t')
                print('Insert sspace sd:', self.insert_sspace_sd, sep='\t')
                print('Max insert:', self.insert_proper_pair_max, sep='\t')
                print()
            return True


    def _init_and_run_clusters(self):
        """Construct a cluster.Cluster for every cluster with more than two
        reads, then run them all — via Pool.starmap when self.threads > 1,
        serially otherwise. Sets self.clusters (name -> finished Cluster) and
        clears self.clusters_all_ran_ok if anything failed."""
        if len(self.cluster_to_dir) == 0:
            raise Error('Did not get any reads mapped to genes. Cannot continue')

        counter = 0
        cluster_list = []
        self.log_files = []

        # How the thread count within each Cluster.run is managed:
        # We want to handle those cases where there are more total threads allocated to the application than there are clusters
        # remaining to run (for example, there are only two references, and eight threads). If we keep the default thread value
        # of 1 in cluster.Cluster, then we will be wasting the allocated threads. The most simple approach would be to divide
        # all threads equally between clusters before calling Pool.map. Multithreaded external programs like Spades and Bowtie2
        # are then called with multiple threads. That should never be slower than keeping just one thread in cluster.Cluster,
        # except maybe in the extreme cases when (if) a multi-threaded run of the external program takes longer wall-clock time
        # than a single-threaded one. However, this solution would always keep Cluster.threads=1 if the initial number of
        # clusters > number of total threads. This can result in inefficiency at the tail of the Pool.map execution flow -
        # when the clusters are getting finished overall, we are waiting for the completion of fewer and fewer remaining
        # single-threaded cluster tasks while more and more total threads are staying idle. We mitigate this through the
        # following approach:
        # - Create a shared Value object that holds the number of remaining clusters (remaining_clusters).
        # - Each Cluster.run decrements the remaining_clusters when it completes
        # - Cluster.run sets its own thread count to max(1,threads_total//remaining_clusters). This can be done as many times
        #   as needed at various points within Cluster.run (e.g. once before Spades is called, and again before Bowtie2 is
        #   called), in order to catch more idle threads.
        # This is a simple and conservative approach to adaptively use all threads at the tail of the map flow. It
        # never over-subscribes the threads, and it does not require any extra blocking within Cluster.run in order to
        # wait for threads becoming available.
        for cluster_name in sorted(self.cluster_to_dir):
            counter += 1
            # Clusters with <= 2 reads cannot be assembled usefully; skip them
            if self.cluster_read_counts[cluster_name] <= 2:
                if self.verbose:
                    print('Not constructing cluster ', cluster_name, ' because it only has ', self.cluster_read_counts[cluster_name], ' reads (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
                continue

            if self.verbose:
                print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
            new_dir = self.cluster_to_dir[cluster_name]

            self.log_files.append(os.path.join(self.logs_dir, cluster_name + '.log'))

            cluster_list.append(cluster.Cluster(
                new_dir,
                cluster_name,
                self.refdata,
                all_ref_seqs_fasta=self.all_ref_seqs_fasta,
                fail_file=os.path.join(self.fails_dir, cluster_name),
                read_store=self.read_store,
                reference_names=self.cluster_ids[cluster_name],
                logfile=self.log_files[-1],
                assembly_coverage=self.assembly_coverage,
                assembly_kmer=self.assembly_kmer,
                assembler=self.assembler,
                max_insert=self.insert_proper_pair_max,
                min_scaff_depth=self.min_scaff_depth,
                nucmer_min_id=self.nucmer_min_id,
                nucmer_min_len=self.nucmer_min_len,
                nucmer_breaklen=self.nucmer_breaklen,
                reads_insert=self.insert_size,
                sspace_k=self.min_scaff_depth,
                sspace_sd=self.insert_sspace_sd,
                threads=1, # initially set to 1, then will adaptively self-modify while running
                assembled_threshold=self.assembled_threshold,
                unique_threshold=self.unique_threshold,
                max_gene_nt_extend=self.max_gene_nt_extend,
                spades_mode=self.spades_mode,
                spades_options=self.spades_options,
                clean=self.clean,
                extern_progs=self.extern_progs,
                threads_total=self.threads
            ))

        # Here is why we use proxy objects from a Manager process below
        # instead of simple shared multiprocessing.Value counter:
        # Shared memory objects in multiprocessing use tempfile module to
        # create temporary directory, then create temporary file inside it,
        # memmap the file and unlink it. If TMPDIR envar points to a NFS
        # mount, the final cleanup handler from multiprocessing will often
        # return an exception due to a stale NFS file (.nfsxxxx) from a shutil.rmtree
        # call. See help on tempfile.gettempdir() for how the default location of
        # temporary files is selected. The exception is caught in except clause
        # inside multiprocessing cleanup, and only a harmless traceback is printed,
        # but it looks very spooky to the user and causes confusion. We use
        # instead shared proxies from the Manager. Those do not rely on shared
        # memory, and thus bypass the NFS issues. The counter is accessed infrequently
        # relative to computations, so the performance does not suffer.
        # default authkey in the manager will be some generated random-looking string
        manager = multiprocessing.Manager()
        remaining_clusters = manager.Value('l',len(cluster_list))
        # manager.Value does not provide access to the internal RLock that we need for
        # implementing atomic -=, so we need to carry around a separate RLock object.
        remaining_clusters_lock = manager.RLock()

        try:
            if self.threads > 1:
                self.pool = multiprocessing.Pool(self.threads)
                cluster_list = self.pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose), itertools.repeat(self.clean), itertools.repeat(self.fails_dir), itertools.repeat(remaining_clusters),itertools.repeat(remaining_clusters_lock)))
                # harvest the pool as soon as we no longer need it
                self.pool.close()
                self.pool.join()
            else:
                # Single-threaded path: run clusters in-process, in order
                for c in cluster_list:
                    _run_cluster(c, self.verbose, self.clean, self.fails_dir, remaining_clusters, remaining_clusters_lock)
        except:
            self.clusters_all_ran_ok = False

        if self.verbose:
            print('Final value of remaining_clusters counter:', remaining_clusters)

        # Drop the proxy references before shutting down the manager process
        remaining_clusters = None
        remaining_clusters_lock = None
        manager.shutdown()

        # Any marker file in fails_dir means at least one cluster failed
        if len(os.listdir(self.fails_dir)) > 0:
            self.clusters_all_ran_ok = False

        self.clusters = {c.name: c for c in cluster_list}


    @staticmethod
    def _write_report(clusters_in, tsv_out):
        """Write the debug TSV report: the report.columns header line
        (first column prefixed with '#') followed by every cluster's
        report_lines, in sorted cluster-name order."""
        columns = copy.copy(report.columns)
        columns[0] = '#' + columns[0]

        f = pyfastaq.utils.open_file_write(tsv_out)
        print('\t'.join(columns), file=f)
        # Restore the column name; 'columns' is a copy so report.columns is untouched
        columns[0] = columns[0][1:]

        for seq_name in sorted(clusters_in):
            if clusters_in[seq_name].report_lines is None:
                continue
            for line in clusters_in[seq_name].report_lines:
                print(line, file=f)
        pyfastaq.utils.close(f)


    def _write_catted_assemblies_fasta(self, outfile):
        """Concatenate every cluster's assembly sequences into one (gzipped)
        FASTA file. Clusters without an assembly are skipped."""
        f = pyfastaq.utils.open_file_write(outfile)

        for gene in sorted(self.clusters):
            try:
                seq_dict = self.clusters[gene].assembly.sequences
            except:
                # Cluster had no assembly (e.g. it failed) - nothing to write
                continue

            for seq_name in sorted(seq_dict):
                print(seq_dict[seq_name], file=f)

        pyfastaq.utils.close(f)


    def _write_catted_assembled_seqs_fasta(self, outfile):
        """Concatenate every cluster's assembled reference sequences
        (assembly_compare.assembled_reference_sequences) into one FASTA
        file. Clusters without comparison results are skipped."""
        f = pyfastaq.utils.open_file_write(outfile)

        for gene in sorted(self.clusters):
            try:
                seq_dict = self.clusters[gene].assembly_compare.assembled_reference_sequences
            except:
                # No comparison data for this cluster - nothing to write
                continue

            for seq_name in sorted(seq_dict):
                print(seq_dict[seq_name], file=f)

        pyfastaq.utils.close(f)


    def _write_catted_genes_matching_refs_fasta(self, outfile):
        """Write one FASTA of the assembled genes that matched a reference.
        Each sequence id is suffixed with '.<ref type>.<start bases
        added>.<end bases added>' from the cluster's assembly_compare."""
        f = pyfastaq.utils.open_file_write(outfile)

        for gene in sorted(self.clusters):
            if self.clusters[gene].assembly_compare is not None and self.clusters[gene].assembly_compare.gene_matching_ref is not None:
                # Copy before mutating the id, so the Cluster object keeps its original
                seq = copy.copy(self.clusters[gene].assembly_compare.gene_matching_ref)
                seq.id += '.' + '.'.join([
                    self.clusters[gene].assembly_compare.gene_matching_ref_type,
                    str(self.clusters[gene].assembly_compare.gene_start_bases_added),
                    str(self.clusters[gene].assembly_compare.gene_end_bases_added)
                ])
                print(seq, file=f)

        pyfastaq.utils.close(f)


    def _clean(self):
        """Remove intermediate files/directories (fails dir, temp dir, Logs
        dir, read store) when self.clean is set; otherwise do nothing but
        report that --noclean was used. Deletions are best-effort."""
        if self.clean:
            common.rmtree(self.fails_dir)

            try:
                self.tmp_dir_obj.cleanup()
            except:
                pass

            if self.verbose:
                print('Deleting Logs directory', self.logs_dir)
            common.rmtree(self.logs_dir)

            try:
                if self.verbose:
                    print('Deleting reads store files', self.read_store.outfile + '[.tbi]')
                self.read_store.clean()
            except:
                # read_store may never have been created (e.g. nothing mapped)
                pass
        else:
            if self.verbose:
                print('Not deleting anything because --noclean used')


    @classmethod
    def _write_mlst_reports(cls, mlst_profile_file, ariba_report_tsv, outprefix, verbose=False):
        """If an MLST profile file exists in the reference data, run the
        MlstReporter on the filtered report to produce MLST report files."""
        if os.path.exists(mlst_profile_file):
            if verbose:
                print('\nMaking MLST reports', flush=True)
            reporter = mlst_reporter.MlstReporter(ariba_report_tsv, mlst_profile_file, outprefix)
            reporter.run()


    @classmethod
    def _write_tb_resistance_calls_json(cls, ariba_report_tsv, outfile):
        """Convert the ARIBA report into a TB resistance calls dict and
        write it as pretty-printed JSON."""
        calls = tb.report_to_resistance_dict(ariba_report_tsv)
        with open(outfile, 'w') as f:
            json.dump(calls, f, sort_keys=True, indent=4)


    def write_versions_file(self, original_dir):
        """Write version_info.txt (in the current working directory, which is
        outdir when called from _run) recording the command line, the
        directory the run started from, and the tool version report lines."""
        with open('version_info.txt', 'w') as f:
            print('ARIBA run with this command:', file=f)
            print(' '.join([sys.argv[0]] + sys.argv[1:]), file=f)
            print('from this directory:', original_dir, file=f)
            print(file=f)
            print(*self.version_report_lines, sep='\n', file=f)


    def run(self):
        """Public entry point: run the whole pipeline, wrapping _run() so
        that any Error triggers an emergency stop before being re-raised
        with context."""
        try:
            self._run()
        except Error as err:
            self._emergency_stop()
            raise Error('Something went wrong during ariba run. Cannot continue. Error was:\n' + str(err))


    def _run(self):
        """The pipeline body: map and cluster reads, run assemblies (when
        enough proper pairs mapped), then write reports, catted FASTAs,
        catted logs, MLST/TB outputs, and clean up. Always restores the
        original working directory on exit."""
        cwd = os.getcwd()

        try:
            os.chdir(self.outdir)
            self.write_versions_file(cwd)
            self._map_and_cluster_reads()
            # Reset here; _init_and_run_clusters repopulates it, so a None
            # value later means no assemblies were run
            self.log_files = None

            if len(self.cluster_to_dir) > 0:
                got_insert_data_ok = self._set_insert_size_data()
                if not got_insert_data_ok:
                    print('WARNING: not enough proper read pairs (found ' + str(self.proper_pairs) + ') to determine insert size.', file=sys.stderr)
                    print('This probably means that very few reads were mapped at all. No local assemblies will be run', file=sys.stderr)
                    if self.verbose:
                        print('Not enough proper read pairs mapped to determine insert size. Skipping all assemblies.', flush=True)
                else:
                    if self.verbose:
                        print('{:_^79}'.format(' Assembling each cluster '))
                        print('Will run', self.threads, 'cluster(s) in parallel', flush=True)
                    self._init_and_run_clusters()
                    if self.verbose:
                        print('Finished assembling clusters\n')
            else:
                if self.verbose:
                    print('No reads mapped. Skipping all assemblies', flush=True)
                print('WARNING: no reads mapped to reference genes. Therefore no local assemblies will be run', file=sys.stderr)

            if not self.clusters_all_ran_ok:
                raise Error('At least one cluster failed! Stopping...')

            if self.verbose:
                print('{:_^79}'.format(' Writing reports '), flush=True)
                print('Making', self.report_file_all_tsv)
            self._write_report(self.clusters, self.report_file_all_tsv)

            if self.verbose:
                print('Making', self.report_file_filtered)
            rf = report_filter.ReportFilter(infile=self.report_file_all_tsv)
            rf.run(self.report_file_filtered)

            if self.verbose:
                print()
                print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True)
                print(self.catted_assembled_seqs_fasta, 'and', self.catted_genes_matching_refs_fasta, flush=True)
            self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta)
            self._write_catted_genes_matching_refs_fasta(self.catted_genes_matching_refs_fasta)
            self._write_catted_assemblies_fasta(self.catted_assemblies_fasta)

            if self.log_files is not None:
                clusters_log_file = os.path.join(self.outdir, 'log.clusters.gz')
                if self.verbose:
                    print()
                    print('{:_^79}'.format(' Catting cluster log files '), flush=True)
                    print('Writing file', clusters_log_file, flush=True)
                common.cat_files(self.log_files, clusters_log_file)

            if self.verbose:
                print()
                print('{:_^79}'.format(' Cleaning files '), flush=True)
            self._clean()

            Clusters._write_mlst_reports(self.mlst_profile_file, self.report_file_filtered, self.mlst_reports_prefix, verbose=self.verbose)

            if 'tb' in self.refdata.extra_parameters and self.refdata.extra_parameters['tb']:
                Clusters._write_tb_resistance_calls_json(self.report_file_filtered, self.tb_resistance_calls_file)

            if self.clusters_all_ran_ok and self.verbose:
                print('\nAll done!\n')
        finally:
            # Always restore the caller's working directory, even on error
            os.chdir(cwd)