import numpy as np

def read_fasta(fasta_file, split_char=' ', id_field=0):
    '''
        Read a (multi-)FASTA file and return its sequences sorted by length.

        Every line starting with '>' opens a new record. The identifier of a
        record is obtained by dropping the leading '>', stripping the header,
        splitting it at split_char and taking the field at position id_field.
        All following lines up to the next header are stripped of any
        white-space, upper-cased and joined into one sequence string.
        Blank lines are ignored. If an identifier occurs more than once, the
        last record with that identifier replaces the earlier ones.

        Parameters:
            fasta_file: path of the FASTA file to read.
            split_char: separator used to split the header line.
            id_field:   index of the header field used as identifier.

        Returns:
            A dictionary of the form
                { "inputs": [ np.ndarray, ... ],
                  "metadata": { "id": ( identifier, ... ) } }
            where every entry of "inputs" is an array of shape (1,) holding
            one sequence string and "id" holds the matching identifiers in
            the same order. Records are ordered by ascending sequence length
            (ties keep their order of first appearance in the file). An empty
            file yields an empty list and an empty tuple.

        Raises:
            ValueError: if sequence data appears before the first header.
            IndexError: if a header has no field at position id_field.
    '''
    chunks_by_id = dict()
    current_id = None
    with open(fasta_file, 'r') as fasta_f:
        for line in fasta_f:
            if line.startswith('>'):
                # Only the leading '>' marks the header; a '>' occurring later
                # in the header belongs to the text and must be preserved.
                current_id = line[1:].strip().split(split_char)[id_field]
                chunks_by_id[current_id] = []
                continue

            # remove all white-space chars; lines are joined per record below
            residues = ''.join(line.split()).upper()
            if not residues:
                continue
            if current_id is None:
                raise ValueError(
                    'Sequence data found before the first FASTA header in '
                    '{!r}'.format(fasta_file))
            chunks_by_id[current_id].append(residues)

    # sorted() is stable, so equally long sequences keep their file order.
    records = sorted(
        ((seq_id, ''.join(chunks)) for seq_id, chunks in chunks_by_id.items()),
        key=lambda record: len(record[1]))

    # Built explicitly (instead of zip(*records)) so that an empty file
    # produces empty containers rather than an unpacking error.
    identifier = tuple(seq_id for seq_id, _ in records)
    seqs = [np.asarray([seq]) for _, seq in records]
    return {"inputs": seqs,
            "metadata":
                {"id": identifier}
            }