"""
Cryptanalib - A series of useful functions for cryptanalysis
by Daniel "unicornFurnace" Crowley

dependencies - PyCrypto
"""

from Crypto.Util import number
from Crypto.Hash import *
from Crypto.PublicKey import RSA

from decimal import *
import string
import frequency
import zlib
import urllib

# Helper functions
# This section contains various functions that are not terribly
# useful on their own, but allow other functionality to work

def nroot(x, n):
   """
   Return the integer nth root of x.

   When x is a perfect nth power the exact root is returned. Otherwise
   the approximate root is rounded UP to the next integer.

   x - The number to take the root of
   n - The degree of the root, must be positive

   Returns a Decimal holding an integral value.
   Raises ValueError if n is zero or negative.

   Side effect: the precision of the global decimal context is raised
   to max(30, number of digits in x) and is left that way.
   """
   if n <= 0:
      raise ValueError("can't do negative or zero root")

   # Enough significant digits to represent x exactly
   getcontext().prec = max(30,len(str(x)))
   approx_root = Decimal(x) ** (Decimal(1) / Decimal(n))
   # approx_root is positive here, so // 1 is a true floor
   floored = approx_root // 1
   if pow(floored, n) == x:
      return floored
   # Not an exact root: round up (unless already integral)
   if approx_root == floored:
      return approx_root
   return floored + 1

def floor(number):
   """
   Return the closest integer <= number.

   number - An int, float or Decimal

   The result keeps the numeric type produced by number // 1.
   """
   floored = number // 1
   # Decimal's // truncates toward zero instead of flooring, so
   # negative non-integral Decimals need to be pushed down by one.
   if floored > number:
      floored -= 1
   return floored

def ceil(number):
   """
   Return the closest integer >= number.

   number - An int, float or Decimal

   If number is already integral it is returned unchanged.
   """
   floored = number // 1
   # Decimal's // truncates toward zero instead of flooring; correct
   # it so that negative non-integral Decimals round the right way.
   if floored > number:
      floored -= 1
   if number == floored:
      return number
   return floored + 1

def bit_length(input):
   """
   Return the bit length of input.

   EX: 7 (0b111) has length 3
   EX: 8 (0b1000) has length 4

   input - An integer. The sign is ignored. Zero is reported as
      length 1, since bin(0) is '0b0'.
   """
   # abs() keeps the '-' of a negative number out of the count;
   # the - 2 removes the '0b' prefix.
   return len(bin(abs(input))) - 2

# Blinding and unblinding funcs taken graciously from PyCrypto PubKey/RSA/_slowmath.py
def rsa_blind(message, randint, exponent, modulus):
   """
   Return message RSA-blinded with integer randint for a keypair
   with the provided public exponent and modulus.

   message - (int) The value to blind
   randint - (int) The blinding factor
   exponent - (int) The public exponent
   modulus - (int) The modulus

   Computes (message * randint^exponent) mod modulus.
   """
   return (message * pow(randint, exponent, modulus)) % modulus

def rsa_unblind(message, randint, modulus):
   """
   Return message RSA-unblinded with integer randint for a keypair
   with the provided modulus.

   message - (int) The blinded value
   randint - (int) The blinding factor that was used
   modulus - (int) The modulus

   Computes (randint^-1 * message) mod modulus, using
   Crypto.Util.number.inverse for the modular inverse.
   """
   return number.inverse(randint, modulus) * message % modulus

def check_rsa_key(sample):
   """
   Returns a 3-tuple (is_rsa_key, has_private_component, n_bit_length)

   is_rsa_key - a bool indicating that the sample is, in fact, an RSA key
      in a format readable by Crypto.PublicKey.RSA.importKey
   has_private_component - a bool indicating whether or not d was in the
      analyzed key, or False if the sample is not an RSA key
   n_bit_length - an int representing the bit length of the modulus found
      in the analyzed key, or False if the sample is not an RSA key

   sample - (string) The candidate key material
   """
   is_rsa_key = has_private_component = n_bit_length = False

   try:
      rsakey = RSA.importKey(sample.strip())
      is_rsa_key = True
      if rsakey.has_private():
         has_private_component = True
      n_bit_length = bit_length(rsakey.n)
   # Don't really care why it fails, just want to see if it did.
   # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
   except Exception:
      is_rsa_key = has_private_component = n_bit_length = False
   return (is_rsa_key, has_private_component, n_bit_length)

def show_histogram(frequency_table, width=80, sort=True):
   """
   Take a frequency distribution, such as one generated by
   generate_frequency_table() and represent it as a histogram with the
   specified width in characters. The histogram is printed to stdout.

   frequency_table - A frequency distribution (dict of item -> frequency)
   width - The width in characters for the histogram
   sort - (bool) Sort the histogram by frequency value?

   Items whose bar would be zero characters long are not printed.
   An empty table prints nothing.
   """
   if not frequency_table:
      return
   max_value = max(frequency_table.values())
   # float() avoids integer division on Python 2; the guard avoids a
   # ZeroDivisionError when every frequency is zero.
   if max_value:
      normalizing_multiplier = float(width) / max_value
   else:
      normalizing_multiplier = 0

   if sort:
      rows = sorted(frequency_table.items(), key=lambda item: (item[1], item[0]), reverse=True)
   else:
      rows = frequency_table.items()

   print('0%' + ' ' * (width-6) + str(max_value*100)+'%')
   print('-' * width)
   for key, value in rows:
      freq_bars = int(value * normalizing_multiplier)
      if freq_bars != 0:
         print(key + '|' + '=' * freq_bars)

def is_base64_encoded(sample):
   """
   Check if a sample is likely base64-encoded

   sample - (string) The sample to evaluate

   Returns True if every character is in the base64 alphabet (plus
   whitespace and '=') and the sample decodes without error.
   """
   import base64
   base64chars = string.ascii_letters + string.digits + string.whitespace
   base64chars += '/+='
   # Turns out a lot of crazy things will b64-decode happily because
   # the decoder skips characters outside the alphabet. This is the fix.
   if any(char not in base64chars for char in sample):
      return False
   try:
      base64.b64decode(sample)
      return True
   # Python 2 raises TypeError, Python 3 raises binascii.Error (a ValueError)
   except (TypeError, ValueError):
      return False

def is_hex_encoded(sample):
   """
   Check if a sample hex-decodes without error

   sample - (string) The sample to evaluate

   Returns True if the sample is an even-length string of hex digits
   (the empty string counts), False otherwise.
   """
   import binascii
   try:
      binascii.unhexlify(sample)
      return True
   # Python 2 raises TypeError, Python 3 raises binascii.Error (a ValueError)
   except (TypeError, ValueError):
      return False

def is_url_encoded(sample):
   """
   Returns True or False to indicate whether a sample is URL-encoded

   EX: Returns True for "abc%20def"
   EX: Returns False for "foobar"

   sample - (string) The sample to evaluate

   A sample counts as URL-encoded when URL-decoding changes it.
   """
   try:
      from urllib import unquote  # Python 2
   except ImportError:
      from urllib.parse import unquote  # Python 3
   return sample != unquote(sample)

def is_zlib_compressed(sample):
   """
   Check if some sample can be zlib decompressed without error

   sample - (string) The sample to evaluate

   Returns True if zlib.decompress() accepts the sample, False otherwise.
   """
   try:
      zlib.decompress(sample)
      return True
   # zlib.error for bad data, TypeError/ValueError for unusable input
   except (zlib.error, TypeError, ValueError):
      return False

def detect_polybius(sample):
   """
   Detect the use of the polybius cipher

   sample - (string) The sample to evaluate

   Returns True when the sample consists only of spaces and the digits
   0-7, and contains an even number of those digits.
   """
   digits = '01234567'
   correct_charset = all(char in ' ' + digits for char in sample)
   # Counted with sum() because len(filter(...)) fails on Python 3
   digit_count = sum(1 for char in sample if char in digits)
   correct_length = digit_count % 2 == 0
   return correct_charset and correct_length

def monte_carlo_pi(sample):
   """
   Monte Carlo Pi estimation test

   Good for determining the randomness of data, especially when looking at compressed
   vs encrypted data.

   Returns the estimated value of Pi. The closer the returned value to the value of Pi,
   the more entropy in the data. Returns False if the sample is shorter
   than four bytes.

   sample - (string) The sample to evaluate
   """
   # cut down our sample to a multiple of four bytes in length so we
   # can take two two-byte samples for x/y coords
   if len(sample) < 4:
      return False
   if (len(sample) % 4) != 0:
      sample = sample[:-(len(sample)%4)]
   coords = []
   hits = 0
   for offset in range(0,len(sample),4):
      # extract four bytes from sample
      subsample = sample[offset:offset+4]
      # interpret the first two bytes as an X value centred on 32512.5
      # NOTE(review): the high byte is scaled by 255, not 256, so the raw
      # value tops out at 65280 and x can slightly exceed 1.0. Left as-is
      # because callers' thresholds are tuned against this behaviour.
      subsample_x = ((ord(subsample[0])*255)+(ord(subsample[1])))-32512.5
      # map this value down to (roughly) between -1.0 and 1.0
      subsample_x /= 32512.5
      # interpret the next two bytes as a Y value the same way
      subsample_y = ((ord(subsample[2])*255)+(ord(subsample[3])))-32512.5
      subsample_y /= 32512.5
      coords.append((subsample_x, subsample_y))
   # a point is a hit when it falls inside the unit circle
   for coordinate in coords:
      if coordinate[0]**2 + coordinate[1]**2 <= 1:
         hits += 1
   pi_estimate = 4*(float(hits) / (len(sample)//4))
   return pi_estimate

def check_key_reuse(samples):
   """
   Check for key reuse between two or more messages

   Returns a boolean indicating whether two messages have high or low
   bitwise correspondence, which suggests key reuse. Returns None (after
   printing a message) if fewer than two non-empty samples are given.

   samples - (list) Two or more samples for evaluation

   Every sample after the first is compared against the first one. The
   mean hamming distance per compared byte is flagged when it falls
   below 3.25 or above 4.75.
   """
   samples = [sample for sample in samples if len(sample) != 0]
   # < 2 (not == 1) so that zero usable samples cannot divide by zero below
   if len(samples) < 2:
      print('Need more than one non-null sample')
      return None
   total_length = total_hamming_distance = 0
   reference = samples[0]
   for sample in samples[1:]:
      total_hamming_distance += hamming_distance(reference, sample)
      total_length += min(len(reference), len(sample))
   mean_hamming_distance = total_hamming_distance / float(total_length)
   return ((mean_hamming_distance < 3.25) or (mean_hamming_distance > 4.75))

def do_simple_substitution(ciphertext, pt_charset, ct_charset):
   """
   Perform simple substitution based on character sets

   Simplifies the use of string translation tables. If, for instance,
   you wish to transform a ciphertext where 'e' is swapped with 't', you
   would call this function like so:

   do_simple_substitution('Simplt subeieueion ciphtrs art silly','et','te')

   ciphertext - A string to translate
   pt_charset - The character set of the plaintext, usually 'abcdefghijk...xyz'
   ct_charset - The character set of the ciphertext

   Each character of ct_charset is replaced by the character at the same
   position in pt_charset; the two must be the same length.
   """
   # string.maketrans exists on Python 2 only, str.maketrans on Python 3 only
   maketrans = getattr(string, 'maketrans', None) or str.maketrans
   # translate ciphertext to plaintext using mapping
   return ciphertext.translate(maketrans(ct_charset, pt_charset))

# TODO: Implement chi square
def is_random(sample, verbose=False, boolean_results=True):
   """
   Run randomness tests to determine likelihood of data being
   the output of strong crypto or CSPRNG or RNG a la ent

   with boolean_results=True
   Returns a boolean indicating whether all tests for randomness have passed

   with boolean_results=False
   Returns detailed results about what tests passed/failed, as a dict
   with the boolean keys 'mean_failed', 'byte_count_failed',
   'bit_run_failed', 'monte_carlo_failed' and 'compression_ratio_failed'

   An empty sample always returns False.

   sample - A string to evaluate for signs of randomness
   verbose - (bool) Whether to print information about results or not
   boolean_results - (bool) Whether to return True/False or show more details
      on what tests passed or failed
   """
   results = {}
   sample_length = len(sample)
   if sample_length == 0:
      return False
   if sample_length < 100:
      if verbose:
         print('[*] Warning! Small sample size, results may be unreliable.')
   # Arithmetic mean test
   mean = sum([ord(char) for char in sample])/float(sample_length)
   if verbose:
      print('[+] Arithmetic mean of sample is '+str(mean)+'. (127.5 = random)')
   if ((mean <= 110) or (mean >= 145)):
      results['mean_failed'] = True
      if verbose:
         print('[!] Arithmetic mean of sample suggests non-random data.')
   else:
      results['mean_failed'] = False
   # Byte and digraph count test
   # (a real list, so the charset can be iterated more than once)
   byte_count = generate_frequency_table(sample, [chr(value) for value in range(256)])
   min_to_max = max(byte_count.values())-min(byte_count.values())
   if verbose:
      print('[+] Distance between lowest and highest byte frequencies is '+str(min_to_max)+'.')
      print('[+] Distance for 100+ random bytes of data generally does not exceed 0.4')
   if min_to_max > 0.4:
      results['byte_count_failed'] = True
      if verbose:
         print('[!] Distance between byte frequencies suggests non-random data.')
   else:
      results['byte_count_failed'] = False
   # Longest bit run test
   binary_message = ''.join(['{0:08b}'.format(ord(char)) for char in sample])
   longest_bit_run_threshold = 20
   longest_run = 0
   current_run = 0
   prev_bit = None
   for bit in binary_message:
      if bit == prev_bit:
         current_run += 1
      else:
         # counts repeats, so a run of k equal bits scores k-1
         current_run = 0
      if current_run > longest_run:
         longest_run = current_run
      prev_bit = bit
   if verbose:
      print('[+] Longest same-bit run in the provided sample is %s' % str(longest_run))
      print('[+] This value generally doesn\'t exceed 20 in random data.')
   results['bit_run_failed'] = (longest_run >= longest_bit_run_threshold)
   if results['bit_run_failed'] and verbose:
      print('[!] Long same-bit run suggests non-random data.')
   # Monte Carlo estimation of Pi test
   approximate_pi = 3.141592654
   monte_carlo_pi_value_deviation = abs(approximate_pi - monte_carlo_pi(sample))
   results['monte_carlo_failed'] = (monte_carlo_pi_value_deviation > 0.4)
   if verbose:
      print('[+] Deviation between the approx. value of pi and the one generated by this sample using Monte Carlo estimation is %s' % str(monte_carlo_pi_value_deviation))
      print('[+] Deviation for 100+ random bytes of data generally does not exceed 0.4.')
   if results['monte_carlo_failed'] and verbose:
      print('[!] Deviation exceeds 0.4. If no other randomness tests failed, this data may be compressed, not encrypted or random.')
   # Compression ratio test
   compression_ratio = len(zlib.compress(sample,9)) / float(len(sample))
   if verbose:
      print('[+] Zlib best compression ratio is {0:.0f}%'.format(compression_ratio * 100))
      print('[+] Compression ratio for random data is unlikely to be lower than 95%.')
   results['compression_ratio_failed'] = (compression_ratio <= .95)
   if boolean_results:
      if any(results.values()):
         if verbose:
            print('[!] One or more tests for randomness suggests non-random data.')
            print('[!] This data may be the result of weak encryption like XOR.')
            print('[!] This may also suggest a fixed IV or ECB mode.')
            print('[!] This data may also be simply compressed or in a proprietary format.')
         return False
      else:
         if verbose:
            print('[+] This data has passed all randomness tests performed.')
            print('[+] This suggests data generated by a RNG, CSPRNG, or strong encryption.')
         return True
   else:
      if verbose:
         # exactly one failed test: give a hint about what it may mean
         if sum(results.values()) == 1:
            if results['monte_carlo_failed']:
               print('[+] Only the Monte Carlo Pi generation test has failed. This may indicate that the data is not encrypted, but simply compressed.')
            elif results['bit_run_failed']:
               print('[+] Only the longest-bit-run test has failed. This suggests that certain portions of the data are not encrypted.')
      return results

def gcd(a,b):
   """
   Wrapper around extended_gcd() that simply returns the GCD alone.

   a, b - Two integers to find the greatest common divisor of
   """
   return extended_gcd(a,b)[2]

def extended_gcd(a, b):
   """
   Euclid's GCD algorithm, but with the addition that the last x and y values are returned.

   a, b - Two integers to find common factors for

   Returns (Last X value, Last Y value, Greatest common divisor)
   where a*X + b*Y equals the greatest common divisor.
   """
   x,y = 0, 1
   lastx, lasty = 1, 0

   while b:
      # one Euclid step: (a, b) -> (b, a mod b), keeping the quotient
      a, (q, b) = b, divmod(a,b)
      x, lastx = lastx-q*x, x
      y, lasty = lasty-q*y, y

   return (lastx, lasty, a)

def chinese_remainder_theorem(items):
   """
   The Chinese Remainder Theorem algorithm.

   items - A list of 2-tuples such as [(a1, n1),(a2, n2)] that map to congruences:
      a1 is congruent to x mod n1
      a2 is congruent to x mod n2

   Returns a 2-tuple (x mod N, N) where N is the product of all moduli.
   Raises ValueError if the moduli are not pairwise co-prime.
   """
   N = 1
   for a, n in items:
      N *= n

   result = 0
   for a, n in items:
      # integer division: N is an exact multiple of n, and / would
      # produce a float on Python 3
      m = N//n
      r, s, d = extended_gcd(n, m)
      if d != 1:
         # string exceptions are not valid; raise a real exception
         raise ValueError("Input not pairwise co-prime")
      result += a*s*m

   return result % N, N

def detect_block_cipher(ciphertext):
   """
   Detect block cipher by length of ciphertext
   Return largest identified block size, or False if none

   ciphertext - (string) A sample to be evaluated for common block sizes

   Block sizes of 32, 16 and 8 bytes are tried, largest first. Note that
   an empty ciphertext is a multiple of every size and so reports 32.
   """
   for candidate_blocksize in [32,16,8]:
      if len(ciphertext) % candidate_blocksize == 0:
         return candidate_blocksize
   return False

def detect_plaintext(candidate_text, pt_freq_table=frequency.frequency_tables['english_letters'], detect_words=True, common_words=frequency.common_words['english'], individual_scores=False):
   r"""
   Return score for likelihood that string is plaintext
   in specified language as a measure of deviation from
   expected frequency values (lower is better)

   candidate_text - (string) The sample to check for plaintext-like properties

   pt_freq_table - Expected frequency distribution for the plaintext, as generated
      by generate_frequency_table(). If only individual character frequency should
      be matched, ensure you're using a frequency table with only single character
      frequencies. (NOTE(review): the original text went on to name a prefix
      used by the built-in single-character tables; that part was lost -
      check the frequency module.)

   detect_words - (bool) Use a list of strings expected in the correct plaintext,
      aka 'cribs'.
      This can be used in a number of ways. For instance, when attempting to decrypt
      firmware, '\x00\x00\x00\x00\x00' may be a useful crib. When attempting to
      decrypt a PNG file, 'IHDR', 'IDAT', and 'IEND' are useful cribs.

   common_words - (list of strings) Words that are likely to appear in the plaintext.
      Requires detect_words=True.

   individual_scores - (bool) Whether or not to return a tuple with individual scores.
      When True, returns (char_deviation_score, word_count_score) instead of
      the combined score.
   """
   # generate score as deviation from expected character frequency
   pt_freq_table_keys = list(pt_freq_table.keys())
   candidate_dict = generate_frequency_table(candidate_text, pt_freq_table_keys)
   char_deviation_score = 0
   for char in pt_freq_table_keys:
      char_deviation_score += abs(candidate_dict[char]-pt_freq_table[char])

   # generate score as total number of letters in common words found in sample
   word_count_score = 0
   if detect_words:
      word_count_score = count_words(candidate_text, common_words=common_words)
   if individual_scores:
      return (char_deviation_score, word_count_score)

   # More matched words -> smaller contribution; no matches costs a flat 1
   if word_count_score == 0:
      score = 1
   else:
      score = 1.0/word_count_score
   score += char_deviation_score
   return score

def generate_optimized_charset_from_frequency(freq_table, include_zero_freq=False):
   """
   Given a character frequency table such as those returned by generate_frequency_table(),
   return a string with only single characters sorted by frequency of occurrence descending

   freq_table - (dict) Maps characters or strings to their frequency
   include_zero_freq - (bool) Keep characters whose frequency is zero?
   """
   # Keep only single characters, dropping those which never occur
   # unless the caller asked for them
   single_char_freq_table = {}
   for item, freq in freq_table.items():
      if len(item) != 1:
         continue
      if freq == 0 and not include_zero_freq:
         continue
      single_char_freq_table[item] = freq
   # Sort items by frequency, concatenate characters and return as a single string
   ranked = sorted(single_char_freq_table.items(), key=lambda pair: pair[1], reverse=True)
   return ''.join([char for char, freq in ranked])

def generate_frequency_table(text,charset):
   """
   Generate a character frequency table for a given text
   and charset as dict with character or string as key and
   frequency of appearance as value expressed as a decimal

   text - A sample of plaintext to analyze for frequency data
   charset - (list of strings) The set of items to count in the plaintext
      such as ['a','b','c', ... 'z','aa','ab','ac', ... 'zz']

   All counts (including multi-character items) are divided by the
   number of characters of text that appear in charset. If none do,
   every frequency is 0.
   """
   freq_table = {}
   text_len = 0
   for char in charset:
      freq_table[char] = 0
   for char in text:
      # dict lookup instead of scanning the charset for every character
      if char in freq_table:
         freq_table[char] += 1
         text_len += 1
   for multigraph in [item for item in freq_table if len(item) > 1]:
      freq_table[multigraph] = text.count(multigraph)
   # Normalize frequencies with length of text
   for key in freq_table:
      if text_len != 0:
         freq_table[key] /= float(text_len)
      else:
         freq_table[key] = 0
   return freq_table

def generate_optimized_charset(text, include_zero_freq=False):
   """
   Given a sample text, generate a frequency table and
   convert it to a string of characters sorted by frequency
   of appearance in the text. This can be used directly in
   some of the other cryptanalib functions, such as our
   Vaudenay padding oracle decryption function.

   (string) text - The corpus of text from which to learn
      frequency data.
   (bool) include_zero_freq - Keep characters that never appear
      in the text?
   """
   # A real list rather than map(): the charset is iterated more than once
   all_chars = [chr(value) for value in range(256)]
   freq_table = generate_frequency_table(text, charset=all_chars)
   return generate_optimized_charset_from_frequency(freq_table, include_zero_freq=include_zero_freq)

def hamming_distance(string1, string2):
   """
   Calculate and return bitwise hamming distance between two strings

   string1 - The first string to compare
   string2 - The second string to compare

   Only the overlapping prefix is compared when the lengths differ.
   """
   distance = 0
   for char1, char2 in zip(string1, string2):
      # differing bits are exactly the set bits of the XOR
      distance += bin(ord(char1) ^ ord(char2)).count('1')
   return distance

def output_mask(text, charset):
   """
   Output masking - mask all characters besides those in the provided character
   set with dots.

   (string) text - output to mask
   (string) charset - string containing acceptable characters
   """
   allowed = set(charset)
   # 256-entry translation table: allowed characters map to themselves,
   # everything else to '.'
   table = ''.join([chr(value) if chr(value) in allowed else '.' for value in range(256)])
   return text.translate(table)

def string_to_long(instring):
   """
   Take a raw string and convert it to a number

   instring - String to convert. Its bytes are read as one big-endian
      integer. Text strings are encoded as latin-1 first.

   Raises ValueError for an empty string.
   """
   import binascii
   if not isinstance(instring, bytes):
      instring = instring.encode('latin-1')
   # Keep returning a long on Python 2; Python 3 only has int
   try:
      integer_type = long
   except NameError:
      integer_type = int
   return integer_type(binascii.hexlify(instring), 16)

def long_to_string(inlong):
   """
   Take a long and convert it to a string

   inlong - Long to convert (non-negative)

   Returns the big-endian byte string of the number; zero becomes a
   single null byte.
   """
   import binascii
   # '%x' instead of hex(inlong)[2:-1]: the slice assumed a trailing 'L'
   # and chopped off a real digit whenever there wasn't one.
   hex_encoded = '%x' % inlong
   if len(hex_encoded) % 2 == 1:
      hex_encoded = '0' + hex_encoded
   return binascii.unhexlify(hex_encoded)

def split_into_blocks(ciphertext,blocksize):
   """
   Split a string into blocks of length blocksize

   ciphertext - A string to be split
   blocksize - The size in bytes of blocks to output

   Returns a list of blocks. The last block is shorter when the length
   of ciphertext is not a multiple of blocksize.
   """
   block_starts = range(0, len(ciphertext), blocksize)
   return [ciphertext[offset:offset+blocksize] for offset in block_starts]

def sxor(string1, string2):
   """
   XOR two strings and return the result up to the length
   of the shorter string

   string1 - The first string to be XORed
   string2 - The second string to be XORed
   """
   return ''.join(chr(ord(chr1)^ord(chr2)) for chr1, chr2 in zip(string1,string2))

def count_words(candidate_text, common_words=frequency.common_words['english'], case_sensitive=True):
   """
   Count the instances of common words in the expected plaintext
   language, return the total number of characters matched in each

   candidate_text - (string) Sample to analyze
   common_words - (list) Sequences expected to appear in the text
   case_sensitive - (bool) Whether or not to match case sensitively

   Returns the sum, over all words, of occurrences * word length.
   """
   score = 0

   # For a case-insensitive match both sides must be lowercased;
   # lowering only the word would miss capitalised text.
   if not case_sensitive:
      candidate_text = candidate_text.lower()

   for word in common_words:
      if not case_sensitive:
         word = word.lower()
      num_found = candidate_text.count(word)
      if num_found > 0:
         score += num_found * len(word)
   return score

def make_polybius_square(password,extended=False):
   """
   Polybius square generator. Returns a list of strings of equal
   length, either 5x5 or 6x6 depending on whether extended
   Polybius mode is on. Assumes I/J are represented as one letter

   password - (string) The password to use when generating the polybius square
   extended - (bool) Set to True to use a 6x6 square instead of a 5x5

   Returns False if the password contains a character outside the
   alphabet in use (lowercase letters, plus digits in extended mode).
   """
   extended = bool(extended)
   alphabet = string.ascii_lowercase
   if extended:
      alphabet += string.digits
   else:
      # 5x5 mode has room for 25 letters only: fold j into i
      alphabet = alphabet.replace('j', '')
      password = password.replace('j', 'i')
   if any(letter not in alphabet for letter in set(password)):
      return False
   # Password letters in order of first appearance, without repeats
   unique_letters = []
   for letter in password:
      if letter not in unique_letters:
         unique_letters.append(letter)
   # Move the password letters to the front of the alphabet
   for letter in unique_letters:
      alphabet = alphabet.replace(letter, '')
   alphabet = ''.join(unique_letters) + alphabet
   grid_size = 5 + int(extended) # Not necessary, but looks cleaner
   ps = []
   for index in range(0, len(alphabet), grid_size):
      ps.append(alphabet[index:index+grid_size])
   return ps

def polybius_decrypt(ps, ciphertext):
   """
   Decrypt given a polybius square (such as one generated
   by make_polybius_square() ) and a ciphertext.

   ps - A polybius square as generated by make_polybius_square()
   ciphertext - A string to decrypt, made of 1-based digit pairs. The
      first digit of each pair selects the column, the second the row.

   Returns the plaintext, or False if the ciphertext has odd length.
   """
   ct_len = len(ciphertext)
   if (ct_len % 2) != 0:
      return False
   digraphs = []
   plaintext = ''
   for index in range(0,ct_len,2):
      digraphs.append(ciphertext[index:index+2])
   for digraph in digraphs:
      x = int(digraph[0]) - 1
      y = int(digraph[1]) - 1
      plaintext += ps[y][x]
   return plaintext

def detect_ecb(ciphertext):
   """
   Attempts to detect use of ECB by detecting duplicate blocks using common
   block sizes.

   ciphertext - A string to analyze for the indicators of ECB mode

   Returns a 3-tuple (detected, blocksize, repeated_block). Block sizes
   of 32, 16 and 8 bytes are tried, largest first, and only when the
   ciphertext length is a multiple of that size. When nothing repeats
   the result is (False, 0, '').
   """
   ciphertext_len = len(ciphertext)
   for blocksize in [32,16,8]:
      if ciphertext_len % blocksize == 0:
         blocks = split_into_blocks_for_ecb = [ciphertext[offset:offset+blocksize] for offset in range(0, ciphertext_len, blocksize)]
         seen = set()
         for block in blocks:
            if block in seen:
               return (True, blocksize, block)
            # remember the block so a later duplicate can be spotted
            seen.add(block)
   return (False, 0, '')

def pkcs7_padding_remove(text, blocksize):
   """
   PKCS7 padding remove - returns unpadded string if successful, returns False if unsuccessful

   text - The text to pkcs7-unpad
   blocksize - The blocksize of the text

   Note that a text consisting of one full block of padding unpads to
   the empty string, which is falsy; compare the result with
   "is False" to tell the two apart.
   """
   # Nothing to inspect: empty input cannot carry valid padding
   if not text:
      return False
   last_byte = ord(text[-1:])
   # A pad length of zero or larger than a block is never valid
   if last_byte == 0 or last_byte > blocksize:
      return False
   if text[-last_byte:] != chr(last_byte)*last_byte:
      return False
   return text[:-last_byte]

def pkcs7_pad(text, blocksize):
   """
   PKCS7 padding function, returns text with PKCS7 style padding

   text - The text to pkcs7-pad
   blocksize - The blocksize of the text

   Between 1 and blocksize padding bytes are always added, so a text
   that already fills its last block gains one whole extra block.
   """
   pad_num = (blocksize - len(text)%blocksize)
   return text+chr(pad_num)*pad_num

def derive_d_from_pqe(p,q,e):
   """
   Given p, q, and e from factored RSA modulus, derive the private component d

   p - The first of the two factors of the modulus
   q - The second of the two factors of the modulus
   e - The public exponent

   Returns the inverse of e modulo (p-1)*(q-1), computed with
   Crypto.Util.number.inverse.
   """
   d = number.inverse(e,(p-1)*(q-1))
   # Keep returning a long on Python 2; Python 3 only has int
   try:
      return long(d)
   except NameError:
      return int(d)