'''
Cryptanalib - A series of useful functions for cryptanalysis
by Daniel "unicornFurnace" Crowley

dependencies - PyCrypto
'''

from Crypto.Util import number
from Crypto.Hash import *
from Crypto.PublicKey import RSA

from decimal import *
import string
import frequency
import zlib
import urllib

#------------------------------------
# Helper functions
#
# This section contains various functions that are not terribly
# useful on their own, but allow other functionality to work
#------------------------------------

def nroot(x, n):
    """
    Return the integer nth root of x.

    x - The number to take the root of
    n - The degree of the root; must be a positive integer

    Raises ValueError when n is zero or negative.
    """
    if n <= 0:
        raise ValueError("can't do negative or zero root")

    # Give Decimal enough precision to represent x exactly
    getcontext().prec = max(30, len(str(x)))
    root_estimate = Decimal(x) ** (Decimal(1) / Decimal(n))
    rounded_down = floor(root_estimate)
    # Prefer the rounded-down root when it is exact, otherwise round up
    if pow(rounded_down, n) == x:
        return rounded_down
    return ceil(root_estimate)

def floor(number):
    """
    Return the closest integral value <= number.

    The result keeps the operand's numeric type (float in, float out;
    Decimal in, Decimal out).
    """
    quotient, _remainder = divmod(number, 1)
    return quotient

def ceil(number):
    """
    Return the closest integral value >= number.

    Numbers that are already integral are returned unchanged.
    """
    rounded_down = number // 1
    if number != rounded_down:
        return rounded_down + 1
    return number

def bit_length(input):
    """
    Return the bit length of input.

    EX: 7 (0b111) has length 3
    EX: 8 (0b1000) has length 4
    """
    # bin() yields '0b...' so strip two characters' worth of prefix
    binary_form = bin(input)
    return len(binary_form) - 2

# Blinding and unblinding funcs taken graciously from PyCrypto PubKey/RSA/_slowmath.py
def rsa_blind(message, randint, exponent, modulus):
    """
    Return message RSA-blinded with integer randint for a keypair
    with the provided public exponent and modulus.

    Computes (message * randint^exponent) mod modulus.
    """
    blinding_factor = pow(randint, exponent, modulus)
    return (message * blinding_factor) % modulus

def rsa_unblind(message, randint, modulus):
    """
    Return message RSA-unblinded with integer randint for a keypair
    with the provided modulus.

    Multiplies by the modular inverse of randint to undo rsa_blind().
    """
    randint_inverse = number.inverse(randint, modulus)
    return (randint_inverse * message) % modulus

def check_rsa_key(sample):
    """
    Returns a 3-tuple (is_rsa_key, has_private_component, n_bit_length)

    is_rsa_key - a bool indicating that the sample is, in fact, an RSA key
    in a format readable by Crypto.PublicKey.RSA.importKey
    has_private_component - a bool indicating whether or not d was in the
    analyzed key, or False if the sample is not an RSA key
    n_bit_length - an int representing the bit length of the modulus found
    in the analyzed key, or False if the sample is not an RSA key
    """
    valid_key = private = modulus_bits = False

    try:
        candidate_key = RSA.importKey(sample.strip())
        valid_key = True
        private = bool(candidate_key.has_private())
        modulus_bits = bit_length(candidate_key.n)
    # Deliberately broad: any failure just means "not an RSA key"
    except:
        valid_key = False
    return (valid_key, private, modulus_bits)

def show_histogram(frequency_table, width=80, sort=True):
'''
Take a frequency distribution, such as one generated by
generate_frequency_table() and represent it as a histogram with the
specified width in characters

frequency_table - A frequency distribution
width - The width in characters for the histogram
sort - (bool) Sort the histogram by frequency value?
'''
max_value = max(frequency_table.values())
normalizing_multiplier = width / max_value

if sort:
frequency_table = sorted(frequency_table.items(),key=lambda (k,v): (v,k), reverse=True)
else:
frequency_table = frequency_table.items()

print '0%' + ' ' * (width-6) + str(max_value*100)+'%'
print '-' * width

for key, value in frequency_table:
freq_bars = int(value * normalizing_multiplier)
if freq_bars != 0:
print key + '|' + '=' * freq_bars

def is_base64_encoded(sample):
    '''
    Check if a sample is likely base64-encoded

    sample - (string) The sample to evaluate
    '''
    allowed_chars = set(string.letters + string.digits + string.whitespace + '/+=')
    # Turns out a lot of crazy things will b64-decode happily with
    # sample.decode('base64'). This is the fix.
    if not all(char in allowed_chars for char in sample):
        return False
    try:
        sample.decode('base64')
    except:
        return False
    return True

def is_hex_encoded(sample):
    '''
    Check if a sample hex-decodes without error

    sample - (string) The sample to evaluate
    '''
    try:
        sample.decode('hex')
    except:
        return False
    return True

def is_url_encoded(sample):
    """
    Returns True or False to indicate whether a sample is URL-encoded

    EX: Returns True for "abc%20def"
    EX: Returns False for "foobar"
    """
    # If unquoting changes the string, it contained URL escapes
    decoded = urllib.unquote(sample)
    return decoded != sample

def is_zlib_compressed(sample):
    '''
    Check if some sample can be zlib decompressed without error

    sample - (string) The sample to evaluate
    '''
    try:
        zlib.decompress(sample)
    except:
        return False
    return True

def detect_polybius(sample):
    '''
    Detect the use of the polybius cipher

    sample - (string) The sample to evaluate
    '''
    # Only spaces and the digits 0-7 may appear...
    if any([symbol not in ' 01234567' for symbol in sample]):
        return False
    # ...and the digits must pair up into digraphs
    digit_count = len([symbol for symbol in sample if symbol in '01234567'])
    return digit_count % 2 == 0

def monte_carlo_pi(sample):
    '''
    Monte Carlo Pi estimation test

    Good for determining the randomness of data, especially when looking at compressed
    vs encrypted data.

    Returns the estimated value of Pi, or False if the sample is shorter
    than four bytes. The closer the returned value to the value of Pi,
    the more entropy in the data.

    sample - (string) The sample to evaluate
    '''
    # cut down our sample to a multiple of four bytes in length so we
    # can take two two-byte samples for x/y coords
    if len(sample) < 4:
        return False
    if (len(sample) % 4) != 0:
        sample = sample[:-(len(sample) % 4)]
    coords = []
    hits = 0
    for offset in range(0, len(sample), 4):
        # extract four bytes from sample
        subsample = sample[offset:offset+4]
        # interpret the first two bytes as an X value between -32512.5 and 32512.5
        # (bug fix: ord() was called on the whole 4-char slice, a TypeError;
        # index the individual bytes instead)
        subsample_x = ((ord(subsample[0]) * 255) + ord(subsample[1])) - 32512.5
        # map this value down to one between -1.0 and 1.0
        subsample_x /= 32512.5
        # interpret the next two bytes as a Y value between -32512.5 and 32512.5
        subsample_y = ((ord(subsample[2]) * 255) + ord(subsample[3])) - 32512.5
        # map this value down to one between -1.0 and 1.0
        subsample_y /= 32512.5
        coords.append((subsample_x, subsample_y))
    for x, y in coords:
        # count points that landed inside the unit circle
        # (bug fix: the tuple itself was squared before)
        if x ** 2 + y ** 2 <= 1:
            hits += 1
    pi_estimate = 4 * (float(hits) / (len(sample) / 4))
    return pi_estimate

def check_key_reuse(samples):
'''
Check for key reuse between two or more messages

Returns a boolean indicating whether two messages have high or low
bitwise correspondence, which suggests key reuse.

samples - (list) Two or more samples for evaluation
'''
samples = filter(lambda x: len(x) is not 0, samples)
if len(samples) == 1:
print 'Need more than one non-null sample'
return None
total_length = total_hamming_distance = 0
for sample in samples[1:]:
compare_length = min(len(samples),len(sample))
sample_hamming_distance = hamming_distance(samples,sample)
total_hamming_distance += sample_hamming_distance
total_length += compare_length
mean_hamming_distance = total_hamming_distance / float(total_length)
return ((mean_hamming_distance < 3.25) or (mean_hamming_distance > 4.75))

def do_simple_substitution(ciphertext, pt_charset, ct_charset):
    '''
    Perform simple substitution based on character sets

    Simplifies the use of string.translate(). If, for instance, you wish to
    transform a ciphertext where 'e' is swapped with 't', you would call this
    function like so:

    do_simple_substitution('Simplt subeieueion ciphtrs art silly','et','te')

    ciphertext - A string to translate
    pt_charset - The character set of the plaintext, usually 'abcdefghijk...xyz'
    ct_charset - The character set of the ciphertext
    '''
    # Build a ciphertext->plaintext mapping, then apply it in one pass
    mapping = string.maketrans(ct_charset, pt_charset)
    return string.translate(ciphertext, mapping)

# TODO: Implement chi square
def is_random(sample, verbose=False, boolean_results=True):
    '''
    Run randomness tests to determine likelihood of data being
    the output of strong crypto or CSPRNG or RNG a la ent

    with boolean_results=True
    Returns a boolean indicating whether all tests for randomness have passed

    with boolean_results=False
    Returns detailed results about what tests passed/failed

    sample - A string to evaluate for signs of randomness
    verbose - (bool) Whether to print information about results or not
    on what tests passed or failed
    '''
    results = {}
    sample_length = len(sample)
    # Nothing to test; an empty sample is reported as non-random
    if sample_length == 0:
        return False
    if sample_length < 100:
        if verbose:
            print '[*] Warning! Small sample size, results may be unreliable.'
    # Arithmetic mean test: random bytes should average close to 127.5
    mean = sum([ord(char) for char in sample])/float(sample_length)
    if verbose:
        print '[+] Arithmetic mean of sample is '+str(mean)+'. (127.5 = random)'
    # Thresholds 110/145 are loose bounds around 127.5
    if ((mean <= 110) or (mean >= 145)):
        results['mean_failed'] = True
        if verbose:
            print '[!] Arithmetic mean of sample suggests non-random data.'
    else:
        results['mean_failed'] = False
    # Byte and digraph count test: spread between the most and least
    # frequent byte values should be small for random data
    byte_count = generate_frequency_table(sample, map(chr,range(256)))
    min_to_max = max(byte_count.values())-min(byte_count.values())
    if verbose:
        print '[+] Distance between lowest and highest byte frequencies is '+str(min_to_max)+'.'
        print '[+] Distance for 100+ random bytes of data generally does not exceed 0.4'
    if min_to_max > 0.4:
        results['byte_count_failed'] = True
        if verbose:
            print '[!] Distance between byte frequencies suggests non-random data.'
    else:
        results['byte_count_failed'] = False
    # Longest bit run test: scan the bitstream for runs of identical bits
    binary_message = ''.join(['{0:08b}'.format(ord(char)) for char in sample])
    longest_bit_run_threshold = 20
    longest_run = 0
    current_run = 0
    prev_bit = None
    for bit in binary_message:
        if bit == prev_bit:
            current_run += 1
        else:
            current_run = 0
        if current_run > longest_run:
            longest_run = current_run
        prev_bit = bit
    if verbose:
        print '[+] Longest same-bit run in the provided sample is %s' % str(longest_run)
        print '[+] This value generally doesn\'t exceed 20 in random data.'
    results['bit_run_failed'] = (longest_run >= longest_bit_run_threshold)
    if results['bit_run_failed'] and verbose:
        print '[!] Long same-bit run suggests non-random data.'
    # Monte Carlo estimation of Pi test: large deviation from pi
    # indicates structured (non-random) data
    approximate_pi = 3.141592654
    monte_carlo_pi_value_deviation = abs(approximate_pi - monte_carlo_pi(sample))
    results['monte_carlo_failed'] = (monte_carlo_pi_value_deviation > 0.4)
    if verbose:
        print '[+] Deviation between the approx. value of pi and the one generated by this sample using Monte Carlo estimation is %s' % str(monte_carlo_pi_value_deviation)
        print '[+] Deviation for 100+ random bytes of data generally does not exceed 0.4.'
    if results['monte_carlo_failed'] and verbose:
        print '[!] Deviation exceeds 0.4. If no other randomness tests failed, this data may be compressed, not encrypted or random.'
    # Compression ratio test: random data should be nearly incompressible
    compression_ratio = len(zlib.compress(sample,9)) / float(len(sample))
    if verbose:
        print '[+] Zlib best compression ratio is {0:.0f}%'.format(compression_ratio * 100)
        print '[+] Compression ratio for random data is unlikely to be lower than 95%.'
    results['compression_ratio_failed'] = (compression_ratio <= .95)
    if boolean_results:
        # Collapse all individual test outcomes into a single pass/fail
        if any(results.values()):
            if verbose:
                print '[!] One or more tests for randomness suggests non-random data.'
                print '[!] This data may be the result of weak encryption like XOR.'
                print '[!] This may also suggest a fixed IV or ECB mode.'
                print '[!] This data may also be simply compressed or in a proprietary format.'
            return False
        else:
            if verbose:
                print '[+] This data has passed all randomness tests performed.'
                print '[+] This suggests data generated by a RNG, CSPRNG, or strong encryption.'
            return True
    else:
        # Detailed mode: point out the likely interpretation when exactly
        # one test failed, then hand back the full result dict
        if verbose:
            if sum(results.values()) == 1:
                if results['monte_carlo_failed']:
                    print '[+] Only the Monte Carlo Pi generation test has failed. This may indicate that the data is not encrypted, but simply compressed.'
                elif results['bit_run_failed']:
                    print '[+] Only the longest-bit-run test has failed. This suggests that certain portions of the data are not encrypted.'
        return results

def gcd(a,b):
    '''
    Wrapper around extended_gcd() that simply returns the GCD alone.

    a, b - Two integers to find the greatest common divisor of
    '''
    # Bug fix: extended_gcd() returns (x, y, gcd); previously the whole
    # tuple was returned, contradicting the documented contract.
    return extended_gcd(a, b)[2]

def extended_gcd(a, b):
    '''
    Euclid's GCD algorithm, but with the addition that the last x and y values are returned.

    a, b - Two integers to find common factors for

    Returns (Last X value, Last Y value, Greatest common divisor),
    satisfying a*x + b*y == gcd(a, b).
    '''
    x, lastx = 0, 1
    y, lasty = 1, 0

    while b != 0:
        quotient, remainder = divmod(a, b)
        a, b = b, remainder
        x, lastx = lastx - quotient * x, x
        y, lasty = lasty - quotient * y, y

    return (lastx, lasty, a)

def chinese_remainder_theorem(items):
    '''
    The Chinese Remainder Theorem algorithm.

    items - A list of 2-tuples such as [(a1, n1),(a2, n2)] that map to congruences:
    a1 is congruent to x mod n1
    a2 is congruent to x mod n2

    Returns (x mod N, N) where N is the product of all moduli.

    Raises ValueError if the moduli are not pairwise co-prime.
    '''
    # N is the product of all moduli
    N = 1
    for a, n in items:
        N *= n

    result = 0
    for a, n in items:
        # Floor division keeps this exact for arbitrarily large ints
        m = N // n
        r, s, d = extended_gcd(n, m)
        if d != 1:
            # Bug fix: raising a bare string is invalid; use a real exception
            raise ValueError("Input not pairwise co-prime")
        result += a * s * m

    return result % N, N

def detect_block_cipher(ciphertext):
    '''
    Detect block cipher by length of ciphertext

    Return largest identified block size, or False if none

    ciphertext - (string) A sample to be evaluated for common block sizes
    '''
    ciphertext_length = len(ciphertext)
    # Check common block sizes from largest to smallest
    for blocksize in (32, 16, 8):
        if ciphertext_length % blocksize == 0:
            return blocksize
    return False

def detect_plaintext(candidate_text, pt_freq_table=frequency.frequency_tables['english_letters'], detect_words=True, common_words=frequency.common_words['english'], individual_scores=False):
    '''
    Return score for likelihood that string is plaintext
    in specified language as a measure of deviation from
    expected frequency values (lower is better)

    candidate_text - (string) The sample to check for plaintext-like properties

    pt_freq_table - Expected frequency distribution for the plaintext, as generated
    by generate_frequency_table(). If only individual character frequency should
    be matched, ensure you're using a frequency table with only single character
    frequencies. If you're using the built-in tables, these are prefixed with
    'single_'.

    detect_words - (bool) Use a list of strings expected in the correct plaintext,
    aka 'cribs'.
    This can be used in a number of ways. For instance, when attempting to decrypt
    firmware, '\x00\x00\x00\x00\x00' may be a useful crib. When attempting to
    decrypt a PNG file, 'IHDR', 'IDAT', and 'IEND' are useful cribs.

    common_words - (list of strings) Words that are likely to appear in the plaintext.
    Requires detect_words=True.

    individual_scores - (bool) Whether or not to return a tuple with individual scores.
    '''
    # Deviation of observed character frequencies from the expected table
    expected_symbols = pt_freq_table.keys()
    observed_freqs = generate_frequency_table(candidate_text, expected_symbols)
    frequency_deviation = 0
    for symbol in expected_symbols:
        frequency_deviation += abs(observed_freqs[symbol] - pt_freq_table[symbol])

    # Total number of characters matched by known-likely words (cribs)
    word_score = 0
    if detect_words:
        word_score = count_words(candidate_text, common_words=common_words)

    if individual_scores:
        return (frequency_deviation, word_score)

    # Fold the two metrics into one score where lower is better: more
    # matched words shrink the word component toward zero
    if word_score == 0:
        combined = 1
    else:
        combined = 1.0 / word_score
    combined += frequency_deviation
    return combined

def generate_optimized_charset_from_frequency(freq_table, include_zero_freq=False):
    '''
    Given a character frequency table such as those returned by generate_frequency_table(),
    return a string with only single characters sorted by frequency of occurrence descending

    freq_table - (dict) Character/string keys mapped to frequency values
    include_zero_freq - (bool) Keep characters whose frequency is zero?
    '''
    # Filter out frequency items to only single characters
    single_char_items = [item for item in freq_table.items() if len(item[0]) == 1]
    # Filter out items which never occur
    # (bug fix: the old filter compared the whole (key, value) tuple to 0,
    # which is never true, so zero-frequency characters were kept)
    if not include_zero_freq:
        single_char_items = [item for item in single_char_items if item[1] != 0]
    # Sort items by frequency descending, concatenate characters and
    # return as a single string
    # (bug fix: the old sort key ordered by (char, freq) rather than
    # frequency, and the join was applied to the tuples themselves,
    # raising TypeError)
    single_char_items.sort(key=lambda item: item[1], reverse=True)
    return ''.join([item[0] for item in single_char_items])

def generate_frequency_table(text,charset):
    '''
    Generate a character frequency table for a given text
    and charset as dict with character or string as key and
    frequency of appearance as value expressed as a decimal
    percentage

    text - A sample of plaintext to analyze for frequency data
    charset - (list of strings) The set of items to count in the plaintext
    such as ['a','b','c', ... 'z','aa','ab','ac', ... 'zz']
    '''
    freq_table = dict.fromkeys(charset, 0)
    # Count single characters, tracking how many text characters matched
    matched_length = 0
    for symbol in text:
        if symbol in charset:
            freq_table[symbol] += 1
            matched_length += 1
    # Count multi-character items (digraphs etc.) with substring counting
    for multigraph in [item for item in charset if len(item) > 1]:
        freq_table[multigraph] = text.count(multigraph)
    # Normalize frequencies with length of text
    for key in freq_table.keys():
        if matched_length != 0:
            freq_table[key] /= float(matched_length)
        else:
            freq_table[key] = 0
    return freq_table

def generate_optimized_charset(text, include_zero_freq=False):
    '''
    Given a sample text, generate a frequency table and
    convert it to a string of characters sorted by frequency
    of appearance in the text. This can be used directly in
    some of the other cryptanalib functions.

    (string) text - The corpus of text from which to learn
    frequency data.
    include_zero_freq - (bool) Keep characters that never occur?
    '''
    # Learn frequencies over the full single-byte range
    every_byte = map(chr, range(256))
    learned_frequencies = generate_frequency_table(text, charset=every_byte)
    return generate_optimized_charset_from_frequency(learned_frequencies, include_zero_freq=include_zero_freq)

def hamming_distance(string1, string2):
    '''
    Calculate and return bitwise hamming distance between two strings,
    compared byte-by-byte up to the length of the shorter string

    string1 - The first string to compare
    string2 - The second string to compare
    '''
    differing_bits = 0
    for byte1, byte2 in zip(string1, string2):
        # XOR leaves a 1 bit exactly where the two bytes differ
        differing_bits += bin(ord(byte1) ^ ord(byte2)).count('1')
    return differing_bits

def output_mask(text, charset):
    '''
    Output masking - mask all characters besides those in the provided character
    set with dots.

    Parameters:
    (string) text - output to mask
    (string) charset - string containing acceptable characters
    '''
    # NOTE(review): the `def` line was missing from this file; the name and
    # signature are reconstructed from the docstring's parameter list.
    charset = set(charset)
    # Build a 256-entry translation table mapping allowed bytes to
    # themselves and everything else to '.'
    # (the original aliased two names to the same list; one is enough)
    output_chars = map(chr, range(256))
    for charnum in range(256):
        if output_chars[charnum] not in charset:
            output_chars[charnum] = '.'
    return string.translate(text, ''.join(output_chars))

def string_to_long(instring):
    '''
    Take a raw string and convert it to a number

    instring - String to convert
    '''
    # Hex-encode the bytes, then parse the hex digits as a base-16 integer
    hex_digits = instring.encode("hex")
    return long(hex_digits, 16)

def long_to_string(inlong):
    '''
    Take a long and convert it to a string

    inlong - Long to convert
    '''
    # Bug fix: hex(...)[2:-1] assumed a trailing 'L' (long literals only);
    # for plain ints it chopped off the last hex digit. Strip the 'L'
    # suffix only when present.
    hex_encoded = hex(inlong)[2:].rstrip('L')
    # Pad to an even number of digits so hex-decoding succeeds
    if len(hex_encoded) % 2 == 1:
        return ('0' + hex_encoded).decode('hex')
    else:
        return hex_encoded.decode('hex')

def split_into_blocks(ciphertext,blocksize):
    '''
    Split a string into blocks of length blocksize; the final block may
    be shorter if the input length is not a multiple of blocksize

    ciphertext - A string to be split
    blocksize - The size in bytes of blocks to output
    '''
    return [ciphertext[start:start + blocksize]
            for start in xrange(0, len(ciphertext), blocksize)]

def sxor(string1, string2):
    '''
    XOR two strings and return the result up to the length
    of the shorter string

    string1 - The first string to be XORed
    string2 - The second string to be XORed
    '''
    xored_bytes = []
    for char1, char2 in zip(string1, string2):
        xored_bytes.append(chr(ord(char1) ^ ord(char2)))
    return ''.join(xored_bytes)

def count_words(candidate_text, common_words=frequency.common_words['english'], case_sensitive=True):
    '''
    Count the instances of common words in the expected plaintext
    language, return the total number of characters matched in each
    word

    candidate_text - (string) Sample to analyze
    common_words - (list) Sequences expected to appear in the text
    case_sensitive - (bool) Whether or not to match case sensitively
    '''
    total_matched_chars = 0

    for word in common_words:
        target = word if case_sensitive else word.lower()
        occurrences = candidate_text.count(target)
        # Weight each hit by the word's length
        total_matched_chars += occurrences * len(target)

    return total_matched_chars

def make_polybius_square(password, extended=False):
    '''
    Polybius square generator. Returns a list of strings of equal
    length, either 5x5 or 6x6 depending on whether extended
    Polybius mode is on. Assumes I/J are represented as one letter.

    Returns False if the password contains characters outside the
    square's alphabet.

    password - (string) The password to use when generating the polybius square
    extended - (bool) Set to True to use a 6x6 square instead of a 5x5
    '''
    # NOTE(review): the `def` line and the loop over the password letters
    # were missing from this file; both are reconstructed here.
    alphabet = string.lowercase
    if extended == True:
        # 6x6 square fits a-z plus 0-9
        alphabet += string.digits
    else:
        # 5x5 square: drop 'j' so a-z fits in 25 cells
        alphabet = string.replace(string.lowercase, 'j', '')
    if any([x not in alphabet for x in set(password)]):
        return False
    # Collect password letters once each, in first-seen order
    unique_letters = []
    for letter in password:
        if letter not in unique_letters:
            unique_letters.append(letter)
    # Move the password letters to the front of the alphabet
    for letter in unique_letters:
        alphabet = string.replace(alphabet, letter, '')
    for letter in unique_letters[::-1]:
        alphabet = letter + alphabet
    # Slice the keyed alphabet into rows of the square
    ps = []
    alphabet_len = len(alphabet)
    grid_size = 5 + int(extended) # Not necessary, but looks cleaner
    for index in xrange(0, alphabet_len, grid_size):
        ps.append(alphabet[index:index+grid_size])
    return ps

def polybius_decrypt(ps, ciphertext):
    '''
    Decrypt given a polybius square (such as one generated
    by make_polybius_square() ) and a ciphertext.

    Returns the plaintext, or False if the ciphertext length is odd.

    ps - A polybius square as generated by make_polybius_square()
    ciphertext - A string of digit pairs to decrypt
    '''
    ct_len = len(ciphertext)
    # Digits must pair up into (column, row) digraphs
    if (ct_len % 2) != 0:
        return False
    plaintext = ''
    for index in range(0, ct_len, 2):
        digraph = ciphertext[index:index+2]
        # Bug fix: the original called int() on the whole two-character
        # digraph for both coordinates; use each digit separately.
        x = int(digraph[0]) - 1
        y = int(digraph[1]) - 1
        plaintext += ps[y][x]
    return plaintext

def detect_ecb(ciphertext):
    '''
    Attempts to detect use of ECB by detecting duplicate blocks using common
    block sizes.

    Returns a 3-tuple (ecb_detected, blocksize, duplicate_block).

    ciphertext - A string to analyze for the indicators of ECB mode
    '''
    ciphertext_len = len(ciphertext)
    for blocksize in [32,16,8]:
        if ciphertext_len % blocksize == 0:
            blocks = split_into_blocks(ciphertext,blocksize)
            seen = set()
            for block in blocks:
                if block in seen:
                    return (True, blocksize, block)
                # Bug fix: blocks were never added to `seen`, so no
                # duplicate could ever be found
                seen.add(block)
    # Bug fix: the failure return previously sat inside the loop, so the
    # function gave up after inspecting only the first block
    return (False, 0, '')

def pkcs7_padding_remove(text, blocksize):
    '''
    PKCS7 padding remove - returns unpadded string if successful, returns False if unsuccessful

    text - The text to pkcs7-unpad
    blocksize - The blocksize of the text
    '''
    # NOTE(review): the `def` line was missing from this file; the name and
    # signature are reconstructed from the docstring's parameter list.
    # Guard against empty input, which would make ord() raise
    if len(text) == 0:
        return False
    last_byte = ord(text[-1:])
    # A valid pad byte can never exceed the block size
    if last_byte > blocksize:
        return False
    # The final last_byte bytes must all equal the pad byte itself
    if text[-last_byte:] != chr(last_byte)*last_byte:
        return False
    else:
        return text[:-last_byte]

'''
PKCS7 padding - pads the text up to a multiple of blocksize using the PKCS#7 scheme

text - The text to pkcs7-pad
blocksize - The blocksize of the text
'''