__author__ = 'hofmann'

import os
import tempfile
import shutil
import datetime
import time
from scripts.Validator.validator import Validator


# NOTE: this module targets Python 2; the argument checks below rely on the
# Python 2 builtins `basestring` and `long`.
class ProjectFileFolderHandle(Validator):
    """
    Dealing with file and folder locations related to the data produced.

    An instance knows two root locations: a freshly created temporary working
    directory and the output (project) directory. All other paths returned by
    the methods of this class are built from one of those two roots, the
    time stamp of the run and the folder / file names defined below.
    """
    _label = "ProjectFileFolderHandle"

    # Validate raw genomes
    # 0 Community design TRUE
    # 1 Move Genomes TRUE
    # 2 Simulate reads.
    #     Copy distribution file / genome location file to project folder if directly given
    #     Input project folder, unless previous steps done
    #     Output tmp, to be archived
    # 3 Gold standard assembly
    #     !! input tmp, unless previous steps done
    #     output tmp, to be archived
    # 4 Anonymization
    #     !! input tmp, unless previous steps done
    #     output tmp, to be archived
    # 5 Archive
    #     - Compression strength option?
    #     - requires list of all that is to be archived
    #     !! input tmp, unless previous steps done
    #     output project folder

    # 0 disabled
    # 1 discard
    # 2 keep
    # 3 compress

    # Location flags, passed as 'is_tmp' to select a root directory:
    # True selects the temporary directory, False the output directory.
    _TMP = True
    _HardDrive = False

    # (a) reads / (a) gsa / (a) pgsa
    # Only index 0 of these lists is read in this class (as the "input" location).
    _location_reads = [_HardDrive, _HardDrive]
    _location_gsa = [_HardDrive, _HardDrive]
    _location_pgsa = [_HardDrive, _HardDrive]

    # ###################
    #   sub folder_names
    # ###################

    _folder_name_internal = "internal"
    # _folder_name_comunity_design = "comunity_design"
    _folder_name_distribution = "distributions"
    _folder_name_genomes = "source_genomes"
    # _folder_name_meta_data = "meta_data"
    # folder_name_simulated = "simulated_genomes"
    _folder_name_bam = "bam"
    # _folder_name_sam = "sam"
    _folder_name_reads = "reads"
    _folder_name_contigs = "contigs"
    # _folder_name_logfiles = "logfiles"
    _folder_name_sample = "sample_{id}"

    # Sub folders created inside every sample folder
    _sub_folders_sample = [_folder_name_bam, _folder_name_reads, _folder_name_contigs]
    # Sub folders created directly below a root directory
    _sub_folders_output = [_folder_name_internal, _folder_name_distribution, _folder_name_genomes]

    # ###################
    #   file names
    # ###################

    _filename_genome_locations = "genome_locations.tsv"
    _filename_distribution = "distribution.txt"

    _filename_anonymous_reads = "anonymous_reads.fq"
    # filename_reads_anonymous_mapping = "reads_anonymous_mapping.tsv"

    _filename_gsa = "gsa.fasta"
    _filename_anonymous_gsa = "anonymous_gsa.fasta"
    # filename_gsa_anonymous_mapping = "gsa_anonymous_mapping.tsv"

    _filename_gsa_pooled = "gsa_pooled.fasta"
    _filename_anonymous_gsa_pooled = "anonymous_gsa_pooled.fasta"
    # filename_pooled_gsa_mapping = "pooled_" + filename_gsa_anonymous_mapping

    # filename_gsa = "gsa.fasta"
    # filename_pooled_gsa = "pooled_" + filename_gsa

    _filename_reads_mapping = "reads_mapping.tsv"
    _filename_gsa_mapping = "gsa_mapping.tsv"
    _filename_pooled_gsa_mapping = "gsa_pooled_mapping.tsv"

    _filename_log = "pipeline.log"
    _filename_metadata = "meta_data.tsv"

    def __init__(self, tmp_dir, output_dir, time_stamp=None, logfile=None, verbose=True, debug=False):
        """
        Constructor

        A new, uniquely named temporary directory is created inside 'tmp_dir' right away.
        If no time stamp is given, the current local time is used.

        @param tmp_dir: Directory in which the temporary working directory will be created
        @type tmp_dir: str | unicode
        @param output_dir: Directory where final data will be placed
        @type output_dir: str | unicode
        @param time_stamp: timestamp as string, used as prefix of sample folder names
        @type time_stamp: str | unicode
        @param logfile: file | FileIO | StringIO | basestring
        @param verbose: Not verbose means that only warnings and errors will be past to stream
        @type verbose: bool
        @param debug: Display debug messages
        @type debug: bool
        """
        assert isinstance(tmp_dir, basestring)
        assert isinstance(output_dir, basestring)
        assert time_stamp is None or isinstance(time_stamp, basestring)
        self._tmp_dir = tempfile.mkdtemp(dir=tmp_dir)
        self._directory_output = output_dir
        self._time_stamp = time_stamp
        if time_stamp is None:
            self._time_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y.%m.%d_%H.%M.%S')
        super(ProjectFileFolderHandle, self).__init__(logfile, verbose, debug)

    def get_time_stamp(self):
        """
        Get the time stamp used as prefix of sample folder names

        @return: time stamp as string
        @rtype: str | unicode
        """
        return self._time_stamp

    def get_output_directory(self):
        """
        Get directory where final data will be placed

        @return: Directory where final data will be placed
        @rtype: str | unicode
        """
        return self._directory_output

    def remove_directory_temp(self):
        """
        Delete temporary data

        The whole temporary working directory is removed recursively.
        Nothing happens if it does not exist (anymore).

        @return: Nothing
        @rtype: None
        """
        if os.path.exists(self._tmp_dir):
            assert os.path.isdir(self._tmp_dir)
            shutil.rmtree(self._tmp_dir)

    def make_directory_structure(self, number_of_samples):
        """
        Create folder structure at output and temporary location

        @param number_of_samples: Number of samples.
        @type number_of_samples: int | long

        @return: Nothing
        @rtype: None
        """
        assert isinstance(number_of_samples, (int, long))
        self.make_directory_temp_structure(number_of_samples)
        self.make_directory_output_structure(number_of_samples)

    def make_directory_temp_structure(self, number_of_samples):
        """
        Create folder structure at temporary location

        @param number_of_samples: Number of samples.
        @type number_of_samples: int | long

        @return: Nothing
        @rtype: None
        """
        assert isinstance(number_of_samples, (int, long))
        self._make_directory_structure(self._TMP, number_of_samples)

    def make_directory_output_structure(self, number_of_samples):
        """
        Create folder structure at output location

        @param number_of_samples: Number of samples.
        @type number_of_samples: int | long

        @return: Nothing
        @rtype: None
        """
        assert isinstance(number_of_samples, (int, long))
        self._make_directory_structure(self._HardDrive, number_of_samples)

    def _make_directory_structure(self, is_tmp, number_of_samples):
        """
        Create folder structure at either the temporary or the output location

        Created are the root directory itself, the general sub folders
        and one folder per sample (ids "0" .. "number_of_samples - 1"),
        each with its own sub folders.

        @param is_tmp: Location where the directory structure is to be created.
        @type is_tmp: bool
        @param number_of_samples: Number of samples.
        @type number_of_samples: int | long

        @return: Nothing
        @rtype: None
        """
        assert isinstance(is_tmp, bool)
        assert isinstance(number_of_samples, (int, long))
        dir_main = self._get_root_directory(is_tmp)
        self._make_dir(dir_main)
        for sub_folder in self._sub_folders_output:
            directory = os.path.join(dir_main, sub_folder)
            self._make_dir(directory)
        for sample_index in range(number_of_samples):
            # sample ids are the string form of the sample index
            dir_sample = self.get_sample_dir(is_tmp, str(sample_index))
            self._make_dir(dir_sample)
            for sub_folder in self._sub_folders_sample:
                sub_directory = os.path.join(dir_sample, sub_folder)
                self._make_dir(sub_directory)

    def _make_dir(self, directory):
        """
        Create folder at given location, if it does not exist already.

        Only a single directory level is created (os.mkdir).

        @param directory: Path of the folder to be created.
        @type directory: str | unicode

        @return: Nothing
        @rtype: None
        """
        # validate_dir is inherited from Validator; with only_parent=True it presumably
        # checks the parent directory only -- confirm against Validator
        assert self.validate_dir(directory, only_parent=True)
        if os.path.exists(directory):
            assert os.path.isdir(directory)
        else:
            os.mkdir(directory)

    def get_tmp_wd(self):
        """
        Get location of temporary working directory.

        @return: temporary working directory
        @rtype: str | unicode
        """
        return self._tmp_dir

    def _get_root_directory(self, is_tmp):
        """
        Get root directory based on whether it is at a temporary location or output location.

        @param is_tmp: True for the temporary working directory, False for the output directory
        @type is_tmp: bool

        @return: temporary working directory or output directory
        @rtype: str | unicode
        """
        if is_tmp:
            return self._tmp_dir
        else:
            return self._directory_output

    # ###################
    #   directories
    # ###################

    def get_bam_dirs(self):
        """
        Get list of bam directories of all samples

        Every directory directly below the output directory that passes the sample folder
        validation is considered a sample directory. The result is sorted by sample directory.

        @attention: The list includes previous runs!

        @return: List of bam directories
        @rtype: list[str|unicode]
        """
        out_dir = self.get_output_directory()
        list_of_dirs = [
            os.path.join(out_dir, folder_name) for folder_name in os.listdir(out_dir)
            if os.path.isdir(os.path.join(out_dir, folder_name))]
        # silent=True: non sample folders are expected here and presumably should not be reported
        sample_dirs = sorted([
            directory for directory in list_of_dirs
            if self.validate_dir(directory, sub_directories=self._sub_folders_sample, silent=True)])
        return [os.path.join(sample_dir, self._folder_name_bam) for sample_dir in sample_dirs]

    def get_distribution_dir(self):
        """
        Get directory where distribution files are located.

        @return: distribution directory
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_distribution)

    def get_genome_dir(self):
        """
        Get directory where genome files are located.

        @return: genome directory
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_genomes)

    def get_meta_data_dir(self):
        """
        Get directory where metadata files are located.

        This is the 'internal' folder of the output directory.

        @return: metadata directory
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_internal)

    def get_bam_dir(self, sample_id):
        """
        Get directory where bam files are located.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: bam directory
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        sample_dir = self.get_sample_dir(self._HardDrive, sample_id)
        return os.path.join(sample_dir, self._folder_name_bam)

    def get_reads_dir(self, is_input, sample_id):
        """
        Get directory where fastq files are located.

        @attention: With the current class settings input and output location are the same.

        @param is_input: True if the reads are to be read, False if they are to be written
        @type is_input: bool
        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: fastq directory
        @rtype: str | unicode
        """
        assert isinstance(is_input, bool)
        assert isinstance(sample_id, basestring)
        if is_input:
            sample_dir = self.get_sample_dir(self._location_reads[0], sample_id)
        else:
            sample_dir = self.get_sample_dir(self._HardDrive, sample_id)
        return os.path.join(sample_dir, self._folder_name_reads)

    def get_contigs_dir(self, is_input, sample_id):
        """
        Get directory where contig (gold standard assembly) files are located.

        @attention: With the current class settings input and output location are the same.

        @param is_input: True if the contigs are to be read, False if they are to be written
        @type is_input: bool
        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: contigs directory
        @rtype: str | unicode
        """
        assert isinstance(is_input, bool)
        assert isinstance(sample_id, basestring)
        if is_input:
            # the input location of assemblies is configured by '_location_gsa', not by the reads setting
            sample_dir = self.get_sample_dir(self._location_gsa[0], sample_id)
        else:
            sample_dir = self.get_sample_dir(self._HardDrive, sample_id)
        return os.path.join(sample_dir, self._folder_name_contigs)

    def get_logfile_dir(self):
        """
        Get directory where log files are located.

        This is the 'internal' folder of the output directory.

        @return: logfile directory
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_internal)

    def get_sample_dir(self, is_tmp, sample_id):
        """
        Get directory where sample files are located.

        The folder name has the form '<time stamp>_sample_<sample id>'.

        @param is_tmp: True for the temporary working directory, False for the output directory
        @type is_tmp: bool
        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: sample directory
        @rtype: str | unicode
        """
        assert isinstance(is_tmp, bool)
        assert isinstance(sample_id, basestring)
        root_dir = self._get_root_directory(is_tmp)
        folder_name = "{}_{}".format(self._time_stamp, self._folder_name_sample.format(id=sample_id))
        return os.path.join(root_dir, folder_name)

    # ###################
    #   file paths
    # ###################

    def get_anonymous_gsa_pooled_file_path(self):
        """
        Get file location of the anonymous gold standard assembly based on pooled sample reads.

        @return: file location of anonymous pooled gold standard assembly
        @rtype: str | unicode
        """
        root_dir = self._get_root_directory(self._HardDrive)
        return os.path.join(root_dir, self._filename_anonymous_gsa_pooled)

    def get_gsa_pooled_file_path(self):
        """
        Get file location of the gold standard assembly based on pooled sample reads.

        @return: file location of pooled gold standard assembly
        @rtype: str | unicode
        """
        root_dir = self._get_root_directory(self._HardDrive)
        return os.path.join(root_dir, self._filename_gsa_pooled)

    def get_anonymous_gsa_pooled_map_file_path(self):
        """
        Get file location of the mapping of the anonymous gold standard assembly based on pooled sample reads.

        @return: file location of anonymous pooled gold standard assembly mapping
        @rtype: str | unicode
        """
        root_dir = self._get_root_directory(self._HardDrive)
        return os.path.join(root_dir, self._filename_pooled_gsa_mapping)

    def get_gsa_file_path(self, sample_id):
        """
        Get file location of the gold standard assembly.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of gold standard assembly
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        # '_HardDrive' is False, so it selects the output ('is_input' False) branch of get_contigs_dir
        output_dir = self.get_contigs_dir(self._HardDrive, sample_id)
        return os.path.join(output_dir, self._filename_gsa)

    def get_anonymous_gsa_file_path(self, sample_id):
        """
        Get file location of the anonymous gold standard assembly.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of anonymous gold standard assembly
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        # '_HardDrive' is False, so it selects the output ('is_input' False) branch of get_contigs_dir
        output_dir = self.get_contigs_dir(self._HardDrive, sample_id)
        return os.path.join(output_dir, self._filename_anonymous_gsa)

    def get_anonymous_gsa_map_file_path(self, sample_id):
        """
        Get file location of the anonymous gold standard assembly mapping.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of anonymous gold standard assembly mapping
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        # '_HardDrive' is False, so it selects the output ('is_input' False) branch of get_contigs_dir
        output_dir = self.get_contigs_dir(self._HardDrive, sample_id)
        return os.path.join(output_dir, self._filename_gsa_mapping)

    def get_anonymous_reads_file_path(self, sample_id):
        """
        Get file location of the anonymous reads.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of anonymous reads
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        # '_HardDrive' is False, so it selects the output ('is_input' False) branch of get_reads_dir
        fastq_dir = self.get_reads_dir(self._HardDrive, sample_id)
        return os.path.join(fastq_dir, self._filename_anonymous_reads)

    def get_anonymous_reads_map_file_path(self, sample_id):
        """
        Get file location of the anonymous reads mapping.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of anonymous reads mapping
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        # '_HardDrive' is False, so it selects the output ('is_input' False) branch of get_reads_dir
        fastq_dir = self.get_reads_dir(self._HardDrive, sample_id)
        return os.path.join(fastq_dir, self._filename_reads_mapping)

    def get_distribution_file_path(self, sample_id):
        """
        Get file location of a distribution file of a specific sample.

        The file is located directly in the sample folder of the output directory.

        @param sample_id: sample id of a sample
        @type sample_id: str | unicode

        @return: file location of distribution file
        @rtype: str | unicode
        """
        assert isinstance(sample_id, basestring)
        return os.path.join(
            self.get_sample_dir(self._HardDrive, sample_id), self._filename_distribution)

    def get_distribution_file_path_list(self, number_of_samples):
        """
        Get file locations of all distribution files.

        @param number_of_samples: Number of samples.
        @type number_of_samples: int | long

        @return: file locations of distribution files, ordered by sample index
        @rtype: list[str|unicode]
        """
        assert isinstance(number_of_samples, (int, long))
        return [self.get_distribution_file_path(str(sample_index)) for sample_index in range(number_of_samples)]

    def get_genome_location_file_path(self):
        """
        Get file location of file containing genome locations by genome ids.

        @return: file location of file containing genome locations by genome ids.
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_internal, self._filename_genome_locations)

    def get_log_file_path(self):
        """
        Get logfile location.

        @return: logfile location.
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_internal, self._filename_log)

    def get_genome_metadata_file_path(self):
        """
        Get metadata file location.

        @return: metadata file location.
        @rtype: str | unicode
        """
        root_dir = self._directory_output
        return os.path.join(root_dir, self._folder_name_internal, self._filename_metadata)