#!/usr/bin/env python3
"""Representation of computational kernel for performance model analysis and helper functions."""
import shutil
import textwrap
from copy import deepcopy
import operator
import tempfile
import subprocess
import os
import os.path
import sys
import numbers
import collections
from datetime import datetime
from functools import reduce, lru_cache
import string
from collections import defaultdict
from itertools import chain
import random
import atexit
import re
from contextlib import contextmanager
import fcntl

import sympy
from sympy.utilities.lambdify import implemented_function
from sympy.parsing.sympy_parser import parse_expr
import numpy
import compress_pickle

from pycparser import CParser, c_ast, plyparser
from pycparser.c_generator import CGenerator

from . import kerncraft
from . import incore_model
from .pycparser_utils import clean_code, replace_id


@contextmanager
def set_recursionlimit(new_limit):
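    """Temporarily set the recursion limit; the previous limit is restored on exit."""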
    old_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(new_limit)
    try:
        yield new_limit
    finally:
        sys.setrecursionlimit(old_limit)


@lru_cache()
def symbol_pos_int(*args, **kwargs):
    """Create a sympy.Symbol with positive and integer assumptions."""
    kwargs.update({'positive': True,
                   'integer': True})
    return sympy.Symbol(*args, **kwargs)


def string_to_sympy(s):
    """Convert any string to a sympy object or None."""
    if isinstance(s, int):
        return sympy.Integer(s)
    elif isinstance(s, list):
        return tuple([string_to_sympy(e) for e in s])
    elif s is None:
        return None
    else:
        # Step 1: build expression with the whole alphabet redefined:
        local_dict = {c: symbol_pos_int(c) for c in s if c in string.ascii_letters}
        # TODO find nicer solution for N and other pre-mapped letters
        preliminary_expr = parse_expr(s, local_dict=local_dict)
        # Replace all free symbols with positive integer versions:
        local_dict.update(
            {s.name: symbol_pos_int(s.name) for s in preliminary_expr.free_symbols})
        return parse_expr(s, local_dict=local_dict)


def prefix_indent(prefix, textblock, later_prefix=' '):
    """
    Prefix and indent all lines in *textblock*.

    *prefix* is a prefix string
    *later_prefix* is used on all but the first line; if it is a single
                   character, it is repeated to match the length of *prefix*
    """
    textblock = textblock.split('\n')
    line = prefix + textblock[0] + '\n'
    if len(later_prefix) == 1:
        later_prefix = later_prefix*len(prefix)
    line = line + '\n'.join([later_prefix + x for x in textblock[1:]])
    if line[-1] != '\n':
        return line + '\n'
    else:
        return line


def transform_multidim_to_1d_decl(decl):
    """
    Transform ast of multidimensional declaration to a single dimension declaration.

    In-place operation!

    Returns name and dimensions of array (to be used with transform_multidim_to_1d_ref())
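
    Example (illustrative): the declaration 'double a[N][M]' becomes
    'double a[N*M]' and ('a', [N, M]) is returned.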
    """
    dims = []
    type_ = decl.type
    while type(type_) is c_ast.ArrayDecl:
        dims.append(type_.dim)
        type_ = type_.type

    if dims:
        # Multidimensional array
        decl.type.dim = reduce(lambda l, r: c_ast.BinaryOp('*', l, r), dims)
        decl.type.type = type_

    return decl.name, dims


def transform_multidim_to_1d_ref(aref, dimension_dict):
    """
    Transform ast of multidimensional reference to a single dimension reference.

    In-place operation!
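
    Example (illustrative): with dimension_dict {'a': (N, M)}, the reference
    a[i][j] is rewritten to a[j + i*M].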
    """
    dims = []
    name = aref
    while type(name) is c_ast.ArrayRef:
        dims.append(name.subscript)
        name = name.name

    subscript_list = []
    for i, d in enumerate(dims):
        if i == 0:
            subscript_list.append(d)
        else:
            subscript_list.append(c_ast.BinaryOp('*', d, reduce(
                lambda l, r: c_ast.BinaryOp('*', l, r),
                dimension_dict[name.name][-1:-i-1:-1])))

    aref.subscript = reduce(
        lambda l, r: c_ast.BinaryOp('+', l, r), subscript_list)
    aref.name = name


def transform_array_decl_to_malloc(decl, with_init=True):
    """
    Transform ast of "type var_name[N]" to "type* var_name = aligned_malloc(sizeof(type)*N, 32)"

    In-place operation.

    :param with_init: if False, omit malloc initialization
    """
    if type(decl.type) is not c_ast.ArrayDecl:
        # Not an array declaration, can be ignored
        return

    type_ = c_ast.PtrDecl([], decl.type.type)
    if with_init:
        decl.init = c_ast.FuncCall(
            c_ast.ID('aligned_malloc'),
            c_ast.ExprList([
                c_ast.BinaryOp(
                    '*',
                    c_ast.UnaryOp(
                        'sizeof',
                        c_ast.Typename(None, [], c_ast.TypeDecl(
                            None, [], decl.type.type.type))),
                    decl.type.dim),
                c_ast.Constant('int', '32')]))
    decl.type = type_


def find_node_type(ast, node_type):
    """Return list of nodes with *node_type* in *ast*."""
    if type(ast) is node_type:
        return [ast]
    elif type(ast) is list:
        return reduce(operator.add, list(map(lambda a: find_node_type(a, node_type), ast)), [])
    elif ast is None:
        return []
    else:
        return reduce(operator.add,
                      [find_node_type(o[1], node_type) for o in ast.children()], [])


def find_pragmas(ast):
    """Return list of pragmas in AST."""
    # Delegate to the generic traversal; checking only the root node would miss nested pragmas.
    return find_node_type(ast, c_ast.Pragma)


def force_iterable(f):
    """Will make any functions return an iterable objects by wrapping its result in a list."""
    def wrapper(*args, **kwargs):
        r = f(*args, **kwargs)
        if hasattr(r, '__iter__'):
            return r
        else:
            return [r]
    return wrapper


def reduce_path(path):
    """Reduce absolute path to relative (if shorter) for easier readability."""
    relative_path = os.path.relpath(path)
    if len(relative_path) < len(path):
        return relative_path
    else:
        return path


class Kernel(object):
    """Kernel information with functons to analyze and report access patterns."""

    # Datatype sizes in bytes
    datatypes_size = {('double', '_Complex'): 16,
                      ('float', '_Complex'): 8,
                      ('double',): 8,
                      ('float',): 4}

    def __init__(self, machine=None):
        """Create kernel representation."""
        self._machine = machine
        self._loop_stack = []
        self.variables = {}
        self.sources = {}
        self.destinations = {}
        self._flops = {}
        self.datatype = None
        self.constants = None

        self.clear_state()

    def check(self):
        """Check that information about kernel makes sens and is valid."""
        datatypes = [v[0] for v in self.variables.values()]
        assert len(set(datatypes)) <= 1, 'mixing of datatypes within a kernel is not supported.'

        # TODO combine all checks here

    def set_constant(self, name, value):
        """
        Set constant of name to value.

        :param name: may be a str or a sympy.Symbol
        :param value: must be an int
        """
        assert isinstance(name, (str, sympy.Symbol)), \
            "constant name needs to be of type str or sympy.Symbol"
        assert type(value) is int, "constant value needs to be of type int"
        if isinstance(name, sympy.Symbol):
            self.constants[name] = value
        else:
            self.constants[symbol_pos_int(name)] = value
        # Ensure constants will always be sorted by keys
        self.constants = collections.OrderedDict(
            sorted(self.constants.items(), key=lambda i: str(i[0])))

    def set_variable(self, name, type_, size):
        """
        Register variable of name and type_, with a (multidimensional) size.

        :param name: variable name as it appears in code
        :param type_: may be any key from Kernel.datatypes_size (typically float or double)
        :param size: either None for scalars or an n-tuple of ints for an n-dimensional array
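
        Example (illustrative)::

            kernel.set_variable('a', ('double',), (symbol_pos_int('N'), symbol_pos_int('M')))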
        """
        assert type_ in self.datatypes_size, 'only float, double, float _Complex and ' \
                                             'double _Complex variables are supported'
        if self.datatype is None:
            self.datatype = type_
        else:
            assert type_ == self.datatype, 'mixing of datatypes within a kernel is not supported.'
        assert type(size) in [tuple, type(None)], 'size has to be defined as tuple or None'
        self.variables[name] = (type_, size)

    def clear_state(self):
        """Clear mutable internal states (constants, asm_blocks and asm_block_idx)."""
        self.constants = collections.OrderedDict()
        self.subs_consts.cache_clear()  # clear LRU cache of function

    @lru_cache(40)
    def subs_consts(self, expr):
        """Substitute constants in expression unless it is already a number."""
        if isinstance(expr, numbers.Number):
            return expr
        else:
            return expr.subs(self.constants)

    def array_sizes(self, in_bytes=False, subs_consts=False):
        """
        Return a dictionary with all arrays sizes.

        :param in_bytes: If True, output will be in bytes, not element counts.
        :param subs_consts: If True, output will be numbers and not symbolic.

        Scalar variables are ignored.
        """
        var_sizes = {}

        for var_name, var_info in self.variables.items():
            var_type, var_size = var_info

            # Skipping scalars
            if var_size is None:
                continue

            var_sizes[var_name] = reduce(operator.mul, var_size, 1)

            # Multiply by bytes per element if requested
            if in_bytes:
                element_size = self.datatypes_size[var_type]
                var_sizes[var_name] *= element_size

        if subs_consts:
            return {k: self.subs_consts(v) for k, v in var_sizes.items()}
        else:
            return var_sizes

    def _calculate_relative_offset(self, name, access_dimensions):
        """
        Return the offset from the iteration center in number of elements.

        The order of indices used in access is preserved.
        """
        # TODO to be replaced with compile_global_offsets
        offset = 0
        base_dims = self.variables[name][1]

        for dim, offset_info in enumerate(access_dimensions):
            offset_type, idx_name, dim_offset = offset_info
            assert offset_type == 'rel', 'Only relative access to arrays is supported at the moment'
            # The assert above guarantees relative offsets; no other case can occur.
            offset += self.subs_consts(
                dim_offset*reduce(operator.mul, base_dims[dim+1:], sympy.Integer(1)))

        return offset

    def _remove_duplicate_accesses(self):
        """
        Remove duplicate source and destination accesses
        """
        self.destinations = {var_name: set(acs) for var_name, acs in self.destinations.items()}
        self.sources = {var_name: set(acs) for var_name, acs in self.sources.items()}

    def access_to_sympy(self, var_name, access):
        """
        Transform a (multidimensional) variable access to a flattened sympy expression.

        Also works with flat array accesses.
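
        Example (illustrative): for a variable registered with sizes (N, M),
        the access (i, j) is flattened to i*M + j.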
        """
        if var_name not in self.variables:
            raise ValueError("No declaration of variable {!r} found.".format(var_name))
        base_sizes = self.variables[var_name][1]

        expr = sympy.Number(0)

        for dimension, a in enumerate(access):
            base_size = reduce(operator.mul, base_sizes[dimension+1:], sympy.Integer(1))

            expr += base_size*a

        return expr

    def iteration_length(self, dimension=None):
        """
        Return the number of global loop iterations that are performed.

        If dimension is not None, only the length of that loop dimension is
        returned (-1 is the innermost loop and 0 the outermost).
        """
        total_length = 1

        if dimension is not None:
            loops = [self._loop_stack[dimension]]
        else:
            loops = reversed(self._loop_stack)

        for var_name, start, end, incr in loops:
            # This unspools the iterations:
            length = end-start
            total_length = total_length*length
        return self.subs_consts(total_length)

    def get_loop_stack(self, subs_consts=False):
        """Yield loop stack dictionaries in order from outer to inner."""
        for l in self._loop_stack:
            if subs_consts:
                yield {'index': l[0],
                       'start': self.subs_consts(l[1]),
                       'stop': self.subs_consts(l[2]),
                       'increment': self.subs_consts(l[3])}
            else:
                yield {'index': l[0], 'start': l[1], 'stop': l[2], 'increment': l[3]}

    def index_order(self, sources=True, destinations=True):
        """
        Return the order of indices as they appear in array references.

        Use *sources* and *destinations* to filter output.
        """
        if sources:
            arefs = chain(*self.sources.values())
        else:
            arefs = []
        if destinations:
            arefs = chain(arefs, *self.destinations.values())

        ret = []
        for a in [aref for aref in arefs if aref is not None]:
            ref = []
            for expr in a:
                ref.append(expr.free_symbols)
            ret.append(ref)

        return ret

    def compile_sympy_accesses(self, sources=True, destinations=True):
        """
        Return a dictionary of lists of sympy accesses, for each variable.

        Use *sources* and *destinations* to filter output.
        """
        sympy_accesses = defaultdict(list)
        # Compile sympy accesses
        for var_name in self.variables:
            if sources:
                for r in self.sources.get(var_name, []):
                    if r is None:
                        continue
                    sympy_accesses[var_name].append(self.access_to_sympy(var_name, r))
            if destinations:
                for w in self.destinations.get(var_name, []):
                    if w is None:
                        continue
                    sympy_accesses[var_name].append(self.access_to_sympy(var_name, w))

        return sympy_accesses

    def compile_relative_distances(self, sympy_accesses=None):
        """
        Return load and store distances between accesses.

        :param sympy_accesses: optionally restrict accesses, default from compile_sympy_accesses()

        e.g. if accesses are to [+N, +1, -1, -N], relative distances are [N-1, 2, N-1]

        returned is a dict of list of sympy expressions, for each variable
        """
        if sympy_accesses is None:
            sympy_accesses = self.compile_sympy_accesses()

        sympy_distances = defaultdict(list)
        for var_name, accesses in sympy_accesses.items():
            for i in range(1, len(accesses)):
                sympy_distances[var_name].append((accesses[i-1]-accesses[i]).simplify())

        return sympy_distances

    def global_iterator_to_indices(self, git=None):
        """
        Return sympy expressions translating global_iterator to loop indices.

        If *git* (a concrete global iterator value) is given, the loop indices are
        resolved to integers.
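
        Example (illustrative): for a nest with i in [0, N) outer and j in
        [0, M) inner, a global iterator g maps to i = (g // M) % N and j = g % M.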
        """
        # unwind global iteration count into loop counters:
        base_loop_counters = {}
        global_iterator = symbol_pos_int('global_iterator')
        idiv = implemented_function(sympy.Function('idiv'), lambda x, y: x//y)
        total_length = 1
        last_incr = 1
        for var_name, start, end, incr in reversed(self._loop_stack):
            loop_var = symbol_pos_int(var_name)

            # This unspools the iterations:
            length = end-start  # FIXME is incr handled correctly here?
            counter = start+(idiv(global_iterator*last_incr, total_length)*incr) % length
            total_length = total_length*length
            last_incr = incr

            base_loop_counters[loop_var] = sympy.lambdify(
                global_iterator,
                self.subs_consts(counter), modules=[numpy, {'Mod': numpy.mod}])

            if git is not None:
                try:  # Try to resolve to integer if global_iterator was given
                    base_loop_counters[loop_var] = sympy.Integer(self.subs_consts(counter))
                    continue
                except (ValueError, TypeError):
                    base_loop_counters[loop_var] = base_loop_counters[loop_var](git)

        return base_loop_counters

    @lru_cache(1)
    def global_iterator(self):
        """
        Return global iterator sympy expression
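
        Example (illustrative): for a nest with i in [0, N) outer and
        j in [0, M) inner, the returned expression is j + i*M.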
        """
        global_iterator = sympy.Integer(0)
        total_length = sympy.Integer(1)
        for var_name, start, end, incr in reversed(self._loop_stack):
            loop_var = symbol_pos_int(var_name)
            length = end - start  # FIXME is incr handled correctly here?
            global_iterator += (loop_var - start) * total_length
            total_length *= length
        return global_iterator

    def indices_to_global_iterator(self, indices):
        """
        Transform a dictionary of indices to a global iterator integer.

        Inverse of global_iterator_to_indices().
        """
        global_iterator = self.subs_consts(self.global_iterator().subs(indices))
        return global_iterator

    def max_global_iteration(self):
        """Return global iterator with last iteration number"""
        return self.indices_to_global_iterator({
            symbol_pos_int(var_name): end-1 for var_name, start, end, incr in self._loop_stack
        })

    def compile_global_offsets(self, iteration=0, spacing=0):
        """
        Return load and store offsets on a virtual address space.

        :param iteration: controls the inner index counter
        :param spacing: sets a spacing between the arrays, default is 0

        All array variables (non scalars) are laid out linearly starting from 0. An optional
        spacing can be set. The accesses are based on this layout.

        The iteration 0 is the first iteration. All loops are mapped to this linear iteration
        space.

        Accesses to scalars are ignored.

        Returned are load and store byte-offset pairs for each iteration.
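
        Example (illustrative): compile_global_offsets(iteration=range(2)) returns a
        structured array where offsets[it]['load'] and offsets[it]['store'] contain
        the byte offsets accessed in iteration it.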
        """
        global_load_offsets = []
        global_store_offsets = []

        if isinstance(iteration, range):
            iteration = numpy.arange(iteration.start, iteration.stop, iteration.step, dtype='O')
        else:
            if not isinstance(iteration, collections.abc.Sequence):
                iteration = [iteration]
            iteration = numpy.array(iteration, dtype='O')

        # loop indices based on iteration
        # unwind global iteration count into loop counters:
        base_loop_counters = self.global_iterator_to_indices()
        total_length = self.iteration_length()

        assert iteration.max() < self.subs_consts(total_length), \
            "Iterations go beyond what is possible in the original code ({} vs {}). " \
            "One common reason, is that the iteration length are unrealistically small.".format(
                iteration.max(), self.subs_consts(total_length))

        # Get sizes of arrays and base offsets for each array
        var_sizes = self.array_sizes(in_bytes=True, subs_consts=True)
        base_offsets = {}
        base = 0
        # Always arrange arrays in alphabetical order in memory, for reproducibility
        for var_name, var_size in sorted(var_sizes.items(), key=lambda v: v[0]):
            base_offsets[var_name] = base
            array_total_size = self.subs_consts(var_size + spacing)
            # Round up to the next multiple of 64 byte (typical cacheline size):
            array_total_size = ((int(array_total_size) + 63) & ~63)
            base += array_total_size

        # Gather all read and write accesses to the array:
        for var_name, var_size in var_sizes.items():
            element_size = self.datatypes_size[self.variables[var_name][0]]
            for r in self.sources.get(var_name, []):
                offset_expr = self.access_to_sympy(var_name, r)
                # Ignore accesses that always go to the same location (constant offsets)
                if not any([s in base_loop_counters.keys() for s in offset_expr.free_symbols]):
                    continue
                offset = force_iterable(sympy.lambdify(
                    base_loop_counters.keys(),
                    self.subs_consts(
                        offset_expr*element_size
                        + base_offsets[var_name]), numpy))
                # TODO possibly differentiate between index order
                global_load_offsets.append(offset)
            for w in self.destinations.get(var_name, []):
                offset_expr = self.access_to_sympy(var_name, w)
                # Ignore accesses that always go to the same location (constant offsets)
                if not any([s in base_loop_counters.keys() for s in offset_expr.free_symbols]):
                    continue
                offset = force_iterable(sympy.lambdify(
                    base_loop_counters.keys(),
                    self.subs_consts(
                        offset_expr*element_size
                        + base_offsets[var_name]), numpy))
                # TODO possibly differentiate between index order
                global_store_offsets.append(offset)
                # TODO take element sizes into account, return in bytes

        # Generate numpy.array for each counter
        counter_per_it = [v(iteration) for v in base_loop_counters.values()]

        # Old and slow - left for reference
        ## Data access as they appear with iteration order
        #return zip_longest(zip(*[o(*counter_per_it) for o in global_load_offsets]),
        #                   zip(*[o(*counter_per_it) for o in global_store_offsets]),
        #                   fillvalue=None)

        # Data access as they appear with iteration order
        load_offsets = []
        for o in global_load_offsets:
            load_offsets.append(o(*counter_per_it))
        # Convert to numpy ndarray and transpose to get offsets per iterations
        load_offsets = numpy.asarray(load_offsets).T

        store_offsets = []
        for o in global_store_offsets:
            store_offsets.append(o(*counter_per_it))
        store_offsets = numpy.asarray(store_offsets).T

        # Combine loads and stores
        store_width = store_offsets.shape[1] if len(store_offsets.shape) > 1 else 0
        dtype = [('load', load_offsets.dtype, (load_offsets.shape[1],)),
                 ('store', store_offsets.dtype, (store_width,))]
        offsets = numpy.empty(max(load_offsets.shape[0], store_offsets.shape[0]), dtype=dtype)
        offsets['load'] = load_offsets
        offsets['store'] = store_offsets

        return offsets

    @property
    def bytes_per_iteration(self):
        """
        Consecutive bytes written out per high-level iterations (as counted by loop stack).

        Is used to compute number of iterations per cacheline.
        """
        # TODO Find longest consecutive writes to any variable and use as basis
        var_name = list(self.destinations)[0]
        var_type = self.variables[var_name][0]
        # FIXME this is correct most of the time, but not guaranteed:
        # Multiplying datatype size with step increment of inner-most loop
        return self.datatypes_size[var_type] * self._loop_stack[-1][3]

    def print_kernel_info(self, output_file=sys.stdout):
        """Print kernel information in human readble format."""
        table = ('     idx |        min        max       step\n' +
                 '---------+---------------------------------\n')
        for l in self._loop_stack:
            table += '{:>8} | {!r:>10} {!r:>10} {!r:>10}\n'.format(*l)
        print(prefix_indent('loop stack:        ', table), file=output_file)

        table = ('    name |  offsets   ...\n' +
                 '---------+------------...\n')
        for name, offsets in list(self.sources.items()):
            prefix = '{:>8} | '.format(name)
            right_side = '\n'.join(['{!r:}'.format(o) for o in offsets])
            table += prefix_indent(prefix, right_side, later_prefix='         | ')
        print(prefix_indent('data sources:      ', table), file=output_file)

        table = ('    name |  offsets   ...\n' +
                 '---------+------------...\n')
        for name, offsets in list(self.destinations.items()):
            prefix = '{:>8} | '.format(name)
            right_side = '\n'.join(['{!r:}'.format(o) for o in offsets])
            table += prefix_indent(prefix, right_side, later_prefix='         | ')
        print(prefix_indent('data destinations: ', table), file=output_file)

        table = (' op | count \n' +
                 '----+-------\n')
        for op, count in list(self._flops.items()):
            table += '{:>3} | {:>4}\n'.format(op, count)
        table += '     =======\n'
        table += '      {:>4}'.format(sum(self._flops.values()))
        print(prefix_indent('FLOPs:     ', table), file=output_file)

    def print_variables_info(self, output_file=sys.stdout):
        """Print variables information in human readble format."""
        table = ('    name |   type          size             \n' +
                 '---------+----------------------------------\n')
        for name, var_info in list(self.variables.items()):
            table += '{:>8} | {:>15} {!s:<10}\n'.format(name, ' '.join(var_info[0]), var_info[1])
        print(prefix_indent('variables: ', table), file=output_file)

    def print_constants_info(self, output_file=sys.stdout):
        """Print constants information in human readble format."""
        table = ('    name | value     \n' +
                 '---------+-----------\n')
        for name, value in list(self.constants.items()):
            table += '{!s:>8} | {:<10}\n'.format(name, value)
        print(prefix_indent('constants: ', table), file=output_file)

    def incore_analysis(self, *args, **kwargs):
        """Run in-core analysis (e.g., IACA)."""
        raise NotImplementedError("Kernel does not support compilation and in-core analysis. "
                                  "Try a different model or kernel input format.")

    def build_executable(self, *args, **kwargs):
        """Compile and build binary."""
        raise NotImplementedError("Kernel does not support compilation. Try a different model or "
                                  "kernel input format.")


class KernelCode(Kernel):
    """
    Kernel information gathered from code using pycparser.

    This version allows compilation and generation of code for IACA and LIKWID benchmarking.
    """

    def __init__(self, kernel_code, machine, filename=None, keep_intermediates=True):
        """
        Create kernel representation from source code str and machine object.

        :param kernel_code: string with kernel code file content
        :param machine: MachineModel object
        :param filename: used for prettier error messages and as storage location prefix
        :param keep_intermediates: if set to True, intermediate files (for and by compilation) will
                                   be preserved. If set to False, they will be deleted after use.
        """
        super(KernelCode, self).__init__(machine=machine)

        # Initialize state
        self.clear_state()

        self.kernel_code = kernel_code
        self._filename = filename
        self._keep_intermediates = keep_intermediates
        parser = CParser()
        try:
            self.kernel_ast = parser.parse(self._strip_comments(self._as_function()),
                                           filename=filename).ext[0].body
        except plyparser.ParseError as e:
            print('Error parsing kernel code:', e)
            sys.exit(1)

        self._process_code()

        self.check()
    
    def get_intermediate_location(
            self, name, machine_and_compiler_dependent=True, other_dependencies=[]):
        """
        Get a suitable and reproducible file path string for intermediate files.

        :param name: filename to use for caching
        :param machine_and_compiler_dependent: set to False if file content does not depend on
                                               machine file or compiler settings
        :param other_dependencies: list of strings to use in path; slashes are stripped
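
        Example (illustrative): for a kernel file 'kernels/copy.c', a
        machine-and-compiler-dependent request for 'main.c' yields a path
        resembling 'kernels/.copy.c_kerncraft/<machine-id>/<compiler>/<args>/main.c'.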
        """
        if self._filename:
            base_name = os.path.join(os.path.dirname(self._filename),
                                     '.' + os.path.basename(self._filename) + '_kerncraft')
        else:
            base_name = tempfile.mkdtemp()

        if not self._keep_intermediates:
            # Remove directory and all content up on program exit
            atexit.register(shutil.rmtree, base_name, ignore_errors=True)

        if machine_and_compiler_dependent:
            compiler, compiler_args = self._machine.get_compiler()
            compiler_args = '_'.join(compiler_args).replace('/', '')
            base_name += '/{}/{}/{}/'.format(
                self._machine.get_identifier(), compiler, compiler_args)
        
        for i in other_dependencies:
            base_name = os.path.join(base_name, i.replace('/', ''))

        # Create dirs recursively
        os.makedirs(base_name, exist_ok=True)

        # Build actual file path
        file_path = os.path.join(base_name, name)

        return reduce_path(file_path)

    def _check_freshness(self, file_path):
        """Return True, if file_path exists and file is up-to-date."""
        if os.path.exists(file_path):
            file_modified = datetime.utcfromtimestamp(os.stat(file_path).st_mtime)
            # Check if file is newer than machine file, kerncraft code and input kernel code
            if (file_modified > self._machine.get_last_modified_datetime() and  # machine file
                file_modified > kerncraft.get_last_modified_datetime() and  # kerncraft
                file_modified > datetime.utcfromtimestamp(os.stat(self._filename).st_mtime)):
                    return True
        return False

    def lock_intermediate(self, file_path):
        """
        Lock intermediate. Depending on state, readable or writable.

        A separate file_path+'.lock' file is used. It is the caller's responsibility to close
        the lock.

        :param file_path: path to base the lock file on.

        :return: tuple: (acquired lock mode, lock file pointer)

        lock modes are: fcntl.LOCK_SH, which means file_path is good for read-only access
                        fcntl.LOCK_EX, which means file_path is good for write access and MUST
                        be created/updated.
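
        Typical usage (illustrative sketch)::

            lock_mode, lock_fp = self.lock_intermediate(file_path)
            if lock_mode == fcntl.LOCK_EX:
                # (re)generate file_path, then degrade to a shared lock:
                fcntl.flock(lock_fp, fcntl.LOCK_SH)
            # read file_path; close lock_fp when done to release the lock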
        """
        lock_filename = file_path + '.lock'
        # 1. Open lockfile (create and write)
        lock_fp = open(lock_filename, 'w')
        # 2. Acquire SH lock (blocking)
        fcntl.flock(lock_fp, fcntl.LOCK_SH)
        # 3. Check existence and freshness
        if self._check_freshness(file_path):
            # -> READ MODE
            return (fcntl.LOCK_SH, lock_fp)
        # 4. Release SH lock (to allow other processes already awaiting an exclusive lock to enter)
        fcntl.flock(lock_fp, fcntl.LOCK_UN)
        # 5. Acquire EX lock (blocking)
        fcntl.flock(lock_fp, fcntl.LOCK_EX)
        # 6. Check if file is now fresh (things may have changed!)
        if self._check_freshness(file_path):
            # Acquire SH lock (this will replace EX lock in-place)
            fcntl.flock(lock_fp, fcntl.LOCK_SH)
            # -> READ MODE
            return (fcntl.LOCK_SH, lock_fp)
        # else: -> WRITE MODE
        return (fcntl.LOCK_EX, lock_fp)

    def _strip_comments(self, code):
        clean_code = []
        for l in code.split('\n'):
            i = l.find('//')
            if i > -1:
                clean_code.append(l[:i])
            else:
                clean_code.append(l)
        return '\n'.join(clean_code)

    def print_kernel_code(self, output_file=sys.stdout):
        """Print source code of kernel."""
        print(self.kernel_code, file=output_file)

    def _as_function(self, func_name='test', filename=None):
        if filename is None:
            filename = ''
        else:
            filename = '"{}"'.format(filename)
        return '#line 0 \nvoid {}() {{\n#line 1 {}\n{}\n#line 999 \n}}'.format(
            func_name, filename, self.kernel_code)

    def clear_state(self):
        """Clear mutable internal states."""
        super(KernelCode, self).clear_state()
        self.asm_block = None
        self.pointer_increment = None

    def _process_code(self):
        assert type(self.kernel_ast) is c_ast.Compound, "Kernel has to be a compound statement"

        declarations = []
        loop_nest = []
        swaps = []

        # Check that code follows sections:
        # Sections in the code are (in this specific order):
        # 'declarations' (any number of array and scalar variable declarations)
        # 'loopnest' (a single loop nest)
        # 'swaps' (any number of swaps, may be none)
        section = 'declarations'
        for s in self.kernel_ast.block_items:
            if section == 'declarations':
                if type(s) in [c_ast.Decl]:
                    declarations.append(s)
                    continue
                # anything not a Declaration terminates the declaration section
                else:
                    section = 'loopnest'
            if section == 'loopnest':
                # a single loop is expected, which may be preceded by Pragmas
                if type(s) is c_ast.Pragma:
                    loop_nest.append(s)
                    continue
                elif type(s) is c_ast.For:
                    loop_nest.append(s)
                    section = 'swaps'
                    continue
                else:
                    raise ValueError("Expected for loop or pragma(s), found {} instead.".format(s))
            if section == 'swaps':
                if type(s) is c_ast.FuncCall and s.name.name == 'swap':
                    swaps.append(s)
                    continue
                else:
                    raise ValueError("Beyond the for loop, only function calls of 'swap' may be "
                                     "placed, found {} instead.".format(s))
            else:
                raise ValueError("Malformed code, does not follow declaration-loopnest-swaps "
                                 "structure.")

        for item in declarations:
            array = type(item.type) is c_ast.ArrayDecl

            if array:
                dims = []
                t = item.type
                while type(t) is c_ast.ArrayDecl:
                    dims.append(self.conv_ast_to_sym(t.dim))
                    t = t.type

                self.set_variable(item.name, tuple(t.type.names), tuple(dims))

            else:
                self.set_variable(item.name, tuple(item.type.type.names), None)

        self._p_for(loop_nest[-1])
        self.swaps = swaps

    def conv_ast_to_sym(self, math_ast):
        """
        Convert mathematical expressions to a sympy representation.

        May only contain parentheses, addition, subtraction and multiplication from AST.
        """
        if type(math_ast) is c_ast.ID:
            return symbol_pos_int(math_ast.name)
        elif type(math_ast) is c_ast.Constant:
            return sympy.Integer(math_ast.value)
        else:  # elif type(dim) is c_ast.BinaryOp:
            op = {
                '*': operator.mul,
                '+': operator.add,
                '-': operator.sub
            }

            return op[math_ast.op](
                self.conv_ast_to_sym(math_ast.left),
                self.conv_ast_to_sym(math_ast.right))

    def _get_offsets(self, aref, dim=0):
        """
        Return a tuple of offsets of an ArrayRef object in all dimensions.

        The index order is left to right (c-code order), since the subscripts
        collected from the AST are reversed before returning.
        e.g. c[i+1][j-2] -> (i+1, j-2)

        If aref is actually a c_ast.ID, None will be returned.
        """
        if isinstance(aref, c_ast.ID):
            return None

        # Check for restrictions
        assert type(aref.name) in [c_ast.ArrayRef, c_ast.ID], \
            "array references must only be used with variables or other array references"
        assert type(aref.subscript) in [c_ast.ID, c_ast.Constant, c_ast.BinaryOp], \
            'array subscript must only contain variables or binary operations'

        # Convert subscript to sympy and append
        idxs = [self.conv_ast_to_sym(aref.subscript)]

        # Check for more indices (multi-dimensional access)
        if type(aref.name) is c_ast.ArrayRef:
            idxs += self._get_offsets(aref.name, dim=dim+1)

        # Reverse to preserve order (the subscripts in the AST are traversed backwards)
        if dim == 0:
            idxs.reverse()

        return tuple(idxs)

    @classmethod
    def _get_basename(cls, aref):
        """
        Return base name of ArrayRef object.

        e.g. c[i+1][j-2] -> 'c'
        """
        if isinstance(aref.name, c_ast.ArrayRef):
            return cls._get_basename(aref.name)
        elif isinstance(aref.name, str):
            return aref.name
        else:
            return aref.name.name

    def _p_for(self, floop):
        # Check for restrictions
        assert type(floop) is c_ast.For, "May only be a for loop"
        assert hasattr(floop, 'init') and hasattr(floop, 'cond') and hasattr(floop, 'next'), \
            "Loop must have initial, condition and next statements."
        assert type(floop.init) is c_ast.DeclList, \
            "Initialization of loops need to be declarations."
        assert len(floop.init.decls) == 1, "Only single declaration is allowed in init. of loop."
        assert floop.cond.op == '<', "only lt (<) is allowed as loop condition"
        assert type(floop.cond.left) is c_ast.ID, 'left of cond. operand has to be a variable'
        assert type(floop.cond.right) in [c_ast.Constant, c_ast.ID, c_ast.BinaryOp], \
            'right of cond. operand has to be a constant, a variable or a binary operation'
        assert type(floop.next) in [c_ast.UnaryOp, c_ast.Assignment], \
            'next statement has to be a unary or assignment operation'
        assert floop.next.op in ['++', 'p++', '--', 'p--', '+='], \
            'only ++, -- and += next operations are allowed'
        assert type(floop.stmt) in [c_ast.Compound, c_ast.Assignment, c_ast.For], \
            'the loop body may only be an assignment, a compound or a nested for loop'

        if type(floop.cond.right) is c_ast.ID:
            const_name = floop.cond.right.name
            iter_max = symbol_pos_int(const_name)
        elif type(floop.cond.right) is c_ast.Constant:
            iter_max = sympy.Integer(floop.cond.right.value)
        else:  # type(floop.cond.right) is c_ast.BinaryOp
            bop = floop.cond.right
            assert bop.op in '+-*', ('only addition (+), subtraction (-) and multiplication (*) '
                                     'are accepted operators')
            iter_max = self.conv_ast_to_sym(bop)

        iter_min = self.conv_ast_to_sym(floop.init.decls[0].init)

        if type(floop.next) is c_ast.Assignment:
            assert type(floop.next.lvalue) is c_ast.ID, \
                'next operation may only act on loop counter'
            assert type(floop.next.rvalue) is c_ast.Constant, 'only constant increments are allowed'
            assert floop.next.lvalue.name == floop.cond.left.name == floop.init.decls[0].name, \
                'initial, condition and next statement of for loop must act on same loop ' \
                'counter variable'
            step_size = int(floop.next.rvalue.value)
        else:
            assert type(floop.next.expr) is c_ast.ID, 'next operation may only act on loop counter'
            assert floop.next.expr.name == floop.cond.left.name == floop.init.decls[0].name, \
                'initial, condition and next statement of for loop must act on same loop ' \
                'counter variable'
            assert isinstance(floop.next, c_ast.UnaryOp), 'only assignment or unary operations ' \
                'are allowed for next statement of loop.'
            assert floop.next.op in ['++', 'p++', '--', 'p--'], 'Unary operation can only be ++ ' \
                'or -- in next statement'
            if floop.next.op in ['++', 'p++']:
                step_size = sympy.Integer('1')
            else:  # floop.next.op in ['--', 'p--']:
                step_size = sympy.Integer('-1')

        # Document for loop stack
        self._loop_stack.append(
            # (index name, min, max, step size)
            (floop.init.decls[0].name, iter_min, iter_max, step_size)
        )

        # Traverse tree
        if type(floop.stmt) is c_ast.For:
            self._p_for(floop.stmt)
        elif type(floop.stmt) is c_ast.Assignment:
            self._p_assignment(floop.stmt)
        # Handle For if it is the last statement, only preceded by Pragmas
        elif type(floop.stmt.block_items[-1]) is c_ast.For and \
                all([type(s) == c_ast.Pragma for s in floop.stmt.block_items[:-1]]):
            self._p_for(floop.stmt.block_items[-1])
        else:  # type(floop.stmt) is c_ast.Compound
            # Handle Assignments
            for assgn in floop.stmt.block_items:
                # Ignore pragmas
                if type(assgn) is c_ast.Pragma:
                    continue
                elif type(assgn) is c_ast.Assignment:
                    self._p_assignment(assgn)
                else:
                    raise ValueError("Assignments are only allowed in inner most loop.")

    def _p_assignment(self, stmt):
        # Check for restrictions
        assert type(stmt) is c_ast.Assignment, \
            "Only assignment and pragma statements are allowed in loops."
        assert type(stmt.lvalue) in [c_ast.ArrayRef, c_ast.ID], \
            "Only assignment to array element or varialbe is allowed."

        write_and_read = False
        if stmt.op != '=':
            write_and_read = True
            op = stmt.op.strip('=')
            self._flops[op] = self._flops.get(op, 0)+1

        # Document data destination
        # self.destinations[dest name] = [dest offset, ...])
        self.destinations.setdefault(self._get_basename(stmt.lvalue), set())
        self.destinations[self._get_basename(stmt.lvalue)].add(
            self._get_offsets(stmt.lvalue))

        if write_and_read:
            # this means that +=, -= or something of that sort was used
            self.sources.setdefault(self._get_basename(stmt.lvalue), set())
            self.sources[self._get_basename(stmt.lvalue)].add(
                self._get_offsets(stmt.lvalue))

        # Traverse tree
        self._p_sources(stmt.rvalue)

    def _p_sources(self, stmt):
        sources = []
        assert type(stmt) in \
            [c_ast.ArrayRef, c_ast.Constant, c_ast.ID, c_ast.BinaryOp, c_ast.UnaryOp], \
            'only references to arrays, constants and variables as well as binary operations ' + \
            'are supported'
        assert type(stmt) is not c_ast.UnaryOp or stmt.op in ['-', '--', '++', 'p++', 'p--'], \
            'unary operations are only allowed with -, --, ++, p++ and p--'

        if type(stmt) in [c_ast.ArrayRef, c_ast.ID]:
            # Document data source
            bname = self._get_basename(stmt)
            self.sources.setdefault(bname, set())
            self.sources[bname].add(self._get_offsets(stmt))
        elif type(stmt) is c_ast.BinaryOp:
            # Traverse tree
            self._p_sources(stmt.left)
            self._p_sources(stmt.right)

            self._flops[stmt.op] = self._flops.get(stmt.op, 0)+1
        elif type(stmt) is c_ast.UnaryOp:
            self._p_sources(stmt.expr)
            self._flops[stmt.op] = self._flops.get(stmt.op, 0)+1

        return sources

    def get_index_type(self, loop_nest=None):
        """
        Return index type used in loop nest.

        If index types differ between loops, an exception is raised.
        """
        if loop_nest is None:
            loop_nest = self.get_kernel_loop_nest()
        if type(loop_nest) is c_ast.For:
            loop_nest = [loop_nest]
        index_types = (None, None)
        for s in loop_nest:
            if type(s) is c_ast.For:
                if type(s.stmt) in [c_ast.For, c_ast.Compound]:
                    other = self.get_index_type(loop_nest=s.stmt)
                else:
                    other = None
                index_types = (s.init.decls[0].type.type.names, other)
                break
        if index_types[0] == index_types[1] or index_types[1] is None:
            return index_types[0]
        else:
            raise ValueError("Loop indices must have same type, found {}.".format(index_types))

    def _build_const_declarations(self, with_init=True):
        """
        Generate constant declarations.

        :return: list of declarations
        """
        decls = []

        # Use type as provided by user in loop indices
        index_type = self.get_index_type()

        i = 2  # subscript for cli input, 1 is reserved for repeat
        for k in self.constants:
            # e.g., const long long N = atoi(argv[2]);
            # with the argv subscript increasing for each subsequent constant
            # TODO change subscript of argv depending on constant count
            type_decl = c_ast.TypeDecl(k.name, ['const'], c_ast.IdentifierType(index_type))
            init = None
            if with_init:
                init = c_ast.FuncCall(
                    c_ast.ID('atoi'),
                    c_ast.ExprList([c_ast.ArrayRef(c_ast.ID('argv'),
                                                   c_ast.Constant('int', str(i)))]))
            i += 1
            decls.append(c_ast.Decl(
                k.name, ['const'], [], [],
                type_decl, init, None))

        return decls

    def get_array_declarations(self):
        """Return array declarations."""
        return [d for d in self.kernel_ast.block_items
                if type(d) is c_ast.Decl and type(d.type) is c_ast.ArrayDecl]

    def get_kernel_loop_nest(self):
        """
        Return kernel loop nest, with OpenMP pragmas inserted unless already present.
        """
        loop_nest = [s for s in self.kernel_ast.block_items
                     if type(s) in [c_ast.For, c_ast.Pragma, c_ast.FuncCall]]
        assert len(loop_nest) >= 1, "Found to few for statements in kernel"

        omp_pragmas = [p for p in find_node_type(loop_nest, c_ast.Pragma)
                        if 'omp' in p.string]
        # if omp pragmas were found: replace "parallel for" -> "for"
        for op in omp_pragmas:
            op.string = op.string.replace(' parallel', '')
        # if no omp for pragmas are present, insert suitable one at start of outer loop
        if not omp_pragmas:
            loop_nest.insert(0, c_ast.Pragma("omp for"))

        return loop_nest

    def _build_array_declarations(self, with_init=True):
        """
        Generate declaration statements for arrays.

        Also transforming multi-dim to 1d arrays and initializing with malloc.

        :param with_init: if False, omit malloc initialization

        :return: list of declarations nodes, dictionary of array names and original dimensions
        """
        # copy array declarations from kernel ast
        array_declarations = deepcopy(self.get_array_declarations())
        array_dict = []
        for d in array_declarations:
            # We need to transform
            array_dict.append(transform_multidim_to_1d_decl(d))
            transform_array_decl_to_malloc(d, with_init=with_init)
        return array_declarations, dict(array_dict)

    def _find_inner_most_loop(self, loop_nest):
        """Return inner most for loop in loop nest"""
        r = None
        for s in loop_nest:
            if type(s) is c_ast.For:
                return self._find_inner_most_loop(s) or s
            else:
                r = r or self._find_inner_most_loop(s)
        return r

    def _build_array_initializations(self, array_dimensions):
        """
        Generate initialization statements for arrays.

        :param array_dimensions: dictionary of array dimensions

        :return: list of nodes
        """
        with set_recursionlimit(100000):
            kernel = deepcopy(self.get_kernel_loop_nest())
        # traverse to the inner most for loop:
        inner_most = self._find_inner_most_loop(kernel)
        orig_inner_stmt = inner_most.stmt
        inner_most.stmt = c_ast.Compound([])

        rand_float_str = str(random.uniform(1.0, 0.1))

        # find all array references in original orig_inner_stmt
        for aref in find_node_type(orig_inner_stmt, c_ast.ArrayRef):
            # transform to 1d references
            transform_multidim_to_1d_ref(aref, array_dimensions)
            # build static assignments and inject into inner_most.stmt
            inner_most.stmt.block_items.append(c_ast.Assignment(
                '=', aref, c_ast.Constant('float', rand_float_str)))

        return kernel

    def _build_dummy_calls(self):
        """
        Generate false if branch with dummy calls

        Requires kerncraft.h to be included, which defines dummy(...) and var_false.

        :return: dummy statement
        """
        # Make sure nothing gets removed by inserting dummy calls
        dummy_calls = []
        for d in self.kernel_ast.block_items:
            # Only consider toplevel declarations from kernel ast
            if type(d) is not c_ast.Decl:
                continue
            if type(d.type) is c_ast.ArrayDecl:
                dummy_calls.append(c_ast.FuncCall(
                    c_ast.ID('dummy'),
                    c_ast.ExprList([c_ast.ID(d.name)])))
            else:
                dummy_calls.append(c_ast.FuncCall(
                    c_ast.ID('dummy'),
                    c_ast.ExprList([c_ast.UnaryOp('&', c_ast.ID(d.name))])))
        dummy_stmt = c_ast.If(
            cond=c_ast.ID('var_false'),
            iftrue=c_ast.Compound(dummy_calls),
            iffalse=None)
        return dummy_stmt

    def _build_kernel_function_declaration(self, name='kernel'):
        """Build and return kernel function declaration"""
        array_declarations, array_dimensions = self._build_array_declarations(with_init=False)
        const_declarations = self._build_const_declarations(with_init=False)
        return c_ast.FuncDecl(args=c_ast.ParamList(params=array_declarations +
                                                          const_declarations),
                              type=c_ast.TypeDecl(declname=name,
                                                  quals=[],
                                                  type=c_ast.IdentifierType(names=['void'])))

    def get_scalar_declarations(self):
        """Get all scalar declarations."""
        return [d for d in self.kernel_ast.block_items
                if type(d) is c_ast.Decl and type(d.type) is c_ast.TypeDecl]

    def _build_scalar_extern_declarations(self):
        """Build and return scalar variable declarations, with extern attribute."""
        scalar_declarations = deepcopy(self.get_scalar_declarations())
        for d in scalar_declarations:
            d.storage = ['extern']
        return scalar_declarations

    def _build_scalar_initializations(self):
        """Build and return scalar variable initialization."""
        random.seed(2342)  # we want reproducible random numbers
        scalar_inits = []
        for d in self.get_scalar_declarations():
            if d.type.type.names[0] in ['double', 'float']:
                init_const = c_ast.Constant('float', str(random.uniform(1.0, 0.1)))
            elif d.type.type.names[0] in ['int', 'long', 'long long',
                                          'unsigned int', 'unsigned long', 'unsigned long long']:
                init_const = c_ast.Constant('int', '2')

            scalar_inits.append(c_ast.Assignment(
                '=', c_ast.ID(d.name), init_const))

        return scalar_inits

    def get_kernel_header(self, name='kernel'):
        """
        Generate and store kernel.h

        :return: tuple of filename of header and file pointer of lockfile
        """
        file_name = 'kernel.h'
        file_path = self.get_intermediate_location(
            file_name, machine_and_compiler_dependent=False)
        lock_mode, lock_fp = self.lock_intermediate(file_path)
        if lock_mode == fcntl.LOCK_SH:
            # use cache
            with open(file_path) as f:
                code = f.read()
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            func_decl = self._build_kernel_function_declaration(name=name)
            scalar_decls = self.get_scalar_declarations()
            code = CGenerator().visit(
                c_ast.FileAST(ext=[func_decl]+scalar_decls))
            with open(file_path, 'w') as f:
                f.write(code)
            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock

        return file_name, lock_fp

    def get_kernel_code(self, openmp=False, name='kernel'):
        """
        Generate and return compilable source code with kernel function from AST.

        :param openmp: include OpenMP pragmas (or strip them)
        :param name: name of kernel function
        """
        assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
                                            "based on a kernel description rather than code."
        filename = 'kernel'
        if openmp:
            filename += '-omp'
        filename += '.c'
        file_path = self.get_intermediate_location(
            filename, machine_and_compiler_dependent=False)
        lock_mode, lock_fp = self.lock_intermediate(file_path)

        if lock_mode == fcntl.LOCK_SH:
            # use cache
            with open(file_path) as f:
                code = f.read()
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            array_declarations, array_dimensions = self._build_array_declarations()

            # Prepare actual kernel loop nest
            with set_recursionlimit(100000):
                kernel = deepcopy(self.get_kernel_loop_nest())
            # find all array references in kernel
            for aref in find_node_type(kernel, c_ast.ArrayRef):
                # transform to 1d references
                transform_multidim_to_1d_ref(aref, array_dimensions)

            function_ast = c_ast.FuncDef(decl=c_ast.Decl(
                name=name, type=self._build_kernel_function_declaration(name=name), quals=[],
                storage=[], funcspec=[], init=None, bitsize=None),
                body=c_ast.Compound(block_items=kernel),
                param_decls=None)

            # Generate code
            with set_recursionlimit(100000):
                code = CGenerator().visit(function_ast)
            
            if not openmp:
                # remove all omp pragmas
                code = re.sub(r'#pragma omp[^\n]*\n', '', code)

            # Insert missing #includes from template to top of code
            code = '#include "kerncraft.h"\n#include "kernel.h"\n\n' + code

            # Store to file
            with open(file_path, 'w') as f:
                f.write(code)
            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock

        return file_path, lock_fp

    def _build_kernel_call(self, name='kernel'):
        """Generate and return kernel call ast."""
        return c_ast.FuncCall(name=c_ast.ID(name=name), args=c_ast.ExprList(exprs=(
            [c_ast.ID(name=d.name) for d in self._build_array_declarations()[0]] +
            [c_ast.ID(name=d.name) for d in self._build_const_declarations()])))

    CODE_TEMPLATE = textwrap.dedent("""
        #include <likwid.h>
        #include <stdlib.h>
        #include "kerncraft.h"
        #include "kernel.h"

        void dummy(void *);
        extern int var_false;

        int main(int argc, char **argv) {
          // Declaring constants
          DECLARE_CONSTS;
          // Declaring arrays
          DECLARE_ARRAYS;
          // Initializing scalars
          INIT_SCALARS;

          likwid_markerInit();
          #pragma omp parallel
          {
            likwid_markerRegisterRegion("loop");
            #pragma omp barrier

            // Initializing arrays in same order as touched in kernel loop nest
            INIT_ARRAYS;

            // Dummy call
            DUMMY_CALLS;

            for(int warmup = 1; warmup >= 0; --warmup) {
              int repeat = 2;
              if(warmup == 0) {
                repeat = atoi(argv[1]);
                likwid_markerStartRegion("loop");
              }

              for(; repeat > 0; --repeat) {
                KERNEL_CALL;
                DUMMY_CALLS;
              }

            }
            likwid_markerStopRegion("loop");
          }
          likwid_markerClose();
        }
        """)

    def get_main_code(self, kernel_function_name='kernel'):
        """
        Generate and return compilable source code from AST.

        :return: tuple of filename and shared lock file pointer
        """
        # TODO produce nicer code, including help text and other "comfort features".
        assert self.kernel_ast is not None, "AST does not exist, this could be due to running " \
                                            "based on a kernel description rather than code."

        file_path = self.get_intermediate_location('main.c', machine_and_compiler_dependent=False)
        lock_mode, lock_fp = self.lock_intermediate(file_path)

        if lock_mode == fcntl.LOCK_SH:
            # use cache
            with open(file_path) as f:
                code = f.read()
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            parser = CParser()
            template_code = self.CODE_TEMPLATE
            template_ast = parser.parse(clean_code(template_code,
                                                   macros=True, comments=True, pragmas=False))
            ast = deepcopy(template_ast)

            # Define and replace DECLARE_CONSTS
            replace_id(ast, "DECLARE_CONSTS", self._build_const_declartions(with_init=True))

            # Define and replace DECLARE_ARRAYS
            array_declarations, array_dimensions = self._build_array_declarations()
            replace_id(ast, "DECLARE_ARRAYS", array_declarations)

            # Define and replace INIT_SCALARS
            replace_id(ast, "INIT_SCALARS", self._build_scalar_initializations())

            # Define and replace DUMMY_CALLS
            replace_id(ast, "DUMMY_CALLS", self._build_dummy_calls())

            # Kernel function declaration insertion is currently disabled:
            # ast.ext.insert(0, self._build_kernel_function_declaration(
            #     name=kernel_function_name))

            # Insert extern scalar declarations at top of file
            ast.ext[:0] = self._build_scalar_extern_declarations()

            # Define and replace KERNEL_CALL
            replace_id(ast, "KERNEL_CALL", self._build_kernel_call())

            # Define and replace INIT_ARRAYS based on previously generated kernel
            replace_id(ast, "INIT_ARRAYS", self._build_array_initializations(array_dimensions))

            # Generate code
            code = CGenerator().visit(ast)

            # Insert missing #includes from template to top of code
            code = '\n'.join([l for l in template_code.split('\n') if l.startswith("#include")]) + \
                   '\n\n' + code

            # Store to file
            with open(file_path, 'w') as f:
                f.write(code)
            fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock

        return file_path, lock_fp

    def assemble_to_object(self, in_filename, verbose=False):
        """
        Assemble *in_filename* assembly into *out_filename* object.

        Returns tuple of filename to binary file and shared lock file pointer
        """
        # Build file name
        file_base_name = os.path.splitext(os.path.basename(in_filename))[0]
        out_filename = self.get_intermediate_location(file_base_name + '.o')
        lock_mode, lock_fp = self.lock_intermediate(out_filename)

        if lock_mode == fcntl.LOCK_SH:
            # use cached version
            pass
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            compiler, compiler_args = self._machine.get_compiler()

            # Compile to object file
            compiler_args.append('-c')

            cmd = [compiler] + [in_filename] + compiler_args + ['-o', out_filename]

            if verbose:
                print('Executing (assemble_to_object): ', ' '.join(cmd))

            try:
                # Assemble all to a binary
                subprocess.check_output(cmd)
                fcntl.flock(lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
            except subprocess.CalledProcessError as e:
                print("Assembly failed:", e, file=sys.stderr)
                sys.exit(1)

        return out_filename, lock_fp

    def compile_kernel(self, assembly=False, openmp=False, verbose=False):
        """
        Compile source (from as_code(type_)) to assembly or object and return (fileptr, filename).

        :return: tuple of filename of compiled kernel and shared lock file pointer
        """
        compiler, compiler_args = self._machine.get_compiler()

        filename = 'kernel'
        if openmp:
            filename += '-omp'
        if assembly:
            compiler_args += ['-S']
            filename += '.s'
        else:
            filename += '.o'
        out_filename = self.get_intermediate_location(filename)
        lock_mode, out_lock_fp = self.lock_intermediate(out_filename)

        if lock_mode == fcntl.LOCK_SH:
            # use cached version
            if verbose:
                print('Executing (compile_kernel): ', 'using cached', out_filename)
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            compiler_args += ['-std=c99']

            header_filename, header_lock_fp = self.get_kernel_header()
            in_filename, in_lock_fp = self.get_kernel_code(openmp=openmp)
            cmd = ([compiler] +
                   [in_filename,
                    '-c',
                    '-I'+reduce_path(os.path.abspath(os.path.dirname(
                        os.path.realpath(__file__)))+'/headers/'),
                    '-o', out_filename] +
                   compiler_args)

            if verbose:
                print('Executing (compile_kernel): ', ' '.join(cmd))

            try:
                subprocess.check_output(cmd)
                in_lock_fp.close()
                header_lock_fp.close()

            except subprocess.CalledProcessError as e:
                print("Compilation failed:", e, file=sys.stderr)
                sys.exit(1)

            # FIXME Hacky workaround for an icc issue: with AVX512, icc may emit
            # vkmovb instructions, which are invalid and should be kmovb.
            if compiler == 'icc' and assembly:
                with open(out_filename, 'r+') as f:
                    # use a local name to avoid shadowing the `assembly` parameter
                    asm_code = f.read()
                    f.seek(0)
                    f.write(asm_code.replace('vkmovb', 'kmovb'))
                    f.truncate()

            fcntl.flock(out_lock_fp, fcntl.LOCK_SH)  # degrade to shared lock

        return out_filename, out_lock_fp

    def incore_analysis(self, asm_block='auto', pointer_increment='auto_with_manual_fallback',
                        model=None, verbose=False):
        """
        Run an in-core analysis and return its outcome.

        :param asm_block: controls how the to-be-marked block is chosen. "auto" (default) results in
                          the largest block, "manual" results in interactive and a number in the
                          according block.
        :param pointer_increment: number of bytes the pointer is incremented after the loop or
                                   - 'auto': automatic detection, RuntimeError raised if failed
                                   - 'auto_with_manual_fallback': like auto, fallsback to manual input
                                   - 'manual': prompt user
        :param model: which model to use, "IACA", "OSACA" or "LLVM-MCA"
        """
        # Get model and parameter
        if model is None:
            model = next(iter(self._machine['in-core model']))
        model_parameter = self._machine['in-core model'][model]

        analysis_filename = self.get_intermediate_location('incore_analysis.pickle.lzma',
            other_dependencies=[model, str(model_parameter)])
        analysis_lock_mode, analysis_lock_fp = self.lock_intermediate(analysis_filename)
        if analysis_lock_mode == fcntl.LOCK_SH:
            # use cached analysis
            analysis, self.pointer_increment = compress_pickle.load(analysis_filename)
            analysis_lock_fp.close()  # release lock
            return analysis, self.pointer_increment
        
        marked_filename = self.get_intermediate_location('kernel-marked.s',
            other_dependencies=[asm_block, pointer_increment])
        lock_mode, marked_lock_fp = self.lock_intermediate(marked_filename)
        if lock_mode == fcntl.LOCK_SH:
            # use cached marked assembly and extract asm_block and pointer_increment
            with open(marked_filename) as f:
                marked_asm = f.read()
            m = re.search(r'pointer_increment=([0-9]+)', marked_asm)
            if m:
                self.pointer_increment = int(m.group(1))
            else:
                print("Could not find `pointer_increment=<byte increment>`. Plase place into file.")
                sys.exit(1)
        else:
            # marked assembly needs update
            asm_filename, asm_lock_fp = self.compile_kernel(assembly=True, verbose=verbose)
            with open(asm_filename, 'r') as in_file, open(marked_filename, 'w') as out_file:
                asm_block, self.pointer_increment = incore_model.asm_instrumentation(
                    in_file, out_file,
                    block_selection=asm_block,
                    pointer_increment=pointer_increment,
                    isa=self._machine['isa'])
            asm_lock_fp.close()
            fcntl.flock(marked_lock_fp, fcntl.LOCK_SH)  # degrade to shared lock

        # analysis_lock_mode == fcntl.LOCK_EX
        if model == 'OSACA':
            analysis = incore_model.osaca_analyse_instrumented_assembly(
                marked_filename, model_parameter)
        elif model == 'LLVM-MCA':
            analysis = incore_model.llvm_mca_analyse_instrumented_assembly(
                marked_filename, model_parameter)
        elif model == 'IACA':
            obj_name, obj_lock_fp = self.assemble_to_object(marked_filename, verbose=verbose)
            analysis = incore_model.iaca_analyse_instrumented_binary(obj_name, model_parameter)
            obj_lock_fp.close()
        else:
            raise ValueError("Unknown micro-architecture model: {!r}".format(model))
        compress_pickle.dump((analysis, self.pointer_increment), analysis_filename)
        analysis_lock_fp.close()
        marked_lock_fp.close()
        return analysis, self.pointer_increment

    def build_executable(self, lflags=None, verbose=False, openmp=False):
        """Compile source to executable with likwid capabilities and return the executable name."""
        compiler, compiler_args = self._machine.get_compiler()

        filename = 'kernel'
        if openmp:
            filename += '-omp'
        out_filename = self.get_intermediate_location(filename)
        lock_mode, out_lock_fp = self.lock_intermediate(out_filename)

        if lock_mode == fcntl.LOCK_SH:
            # use cached version
            if verbose:
                print('Executing (build_executable): ', 'using cached', out_filename)
        else:  # lock_mode == fcntl.LOCK_EX
            # needs update
            if not (('LIKWID_INCLUDE' in os.environ or 'LIKWID_INC' in os.environ) and
                    'LIKWID_LIB' in os.environ):
                print('Could not find LIKWID_INCLUDE (e.g., "-I/apps/likwid/4.1.2/include") and '
                      'LIKWID_LIB (e.g., "-L/apps/likwid/4.1.2/lib") environment variables',
                      file=sys.stderr)
                sys.exit(1)

            compiler_args += [
                '-std=c99',
                '-I'+reduce_path(os.path.abspath(os.path.dirname(
                    os.path.realpath(__file__)))+'/headers/'),
                os.environ.get('LIKWID_INCLUDE', ''),
                os.environ.get('LIKWID_INC', ''),
                '-llikwid']

            # Special case for unit testing: an empty (but set) LIKWID_LIB means
            # "do not link against likwid", so drop '-llikwid' again
            if os.environ.get('LIKWID_LIB') == '':
                compiler_args = compiler_args[:-1]

            if lflags is None:
                lflags = []
            # Linker flags: LIKWID library path(s) and pthread
            lflags += os.environ['LIKWID_LIB'].split(' ') + ['-pthread']
            compiler_args += lflags

            main_filename, main_lock_fp = self.get_main_code()
            kernel_obj_filename, kernel_obj_lock_fp = self.compile_kernel(
                openmp=openmp, verbose=verbose)

            infiles = [reduce_path(os.path.abspath(os.path.dirname(
                os.path.realpath(__file__)))+'/headers/dummy.c'),
                       kernel_obj_filename, main_filename]

            cmd = [compiler] + infiles + compiler_args + ['-o', out_filename]
            # remove empty arguments
            cmd = list(filter(bool, cmd))
            if verbose:
                print('Executing (build_executable): ', ' '.join(cmd))
            try:
                subprocess.check_output(cmd)
                main_lock_fp.close()
                kernel_obj_lock_fp.close()
                fcntl.flock(out_lock_fp, fcntl.LOCK_SH)  # degrade to shared lock
            except subprocess.CalledProcessError as e:
                print("Build failed:", e, file=sys.stderr)
                sys.exit(1)

        return out_filename, out_lock_fp


class KernelDescription(Kernel):
    """
    Kernel information gathered from YAML kernel description file.

    This class does NOT allow compilation, required for IACA analysis (ECMCPU and RooflineIACA)
    and LIKWID benchmarking (benchmark).
    """

    def incore_analysis(self, *args, **kwargs):
        raise NotImplementedError("In-core analysis is not possible based on a Kernel "
                                  "Description")

    def build_executable(self, *args, **kwargs):
        raise NotImplementedError("Building and compilation is not possible based on a Kernel "
                                  "Description")

    def __init__(self, description, machine=None):
        """
        Create kernel representation from a description dictionary.

        :param description: must have a dictionary like interface (e.g., a YAML object).
        """
        super(KernelDescription, self).__init__(machine=machine)

        # Loops
        self._loop_stack = [
            (l['index'], string_to_sympy(l['start']),
             string_to_sympy(l['stop']), string_to_sympy(l['step']))
            for l in description['loops']
        ]

        # Variables
        for var_name, v in description['arrays'].items():
            self.set_variable(var_name, v['type'], string_to_sympy(v['dimension']))

        # Datatype: inferred from the first declared array
        self.datatype = list(self.variables.values())[0][0]

        # Data sources
        self.sources = {
            var_name: {string_to_sympy(idx) for idx in v}
            for var_name, v in description['data sources'].items()
        }

        # Data destinations
        self.destinations = {
            var_name: {string_to_sympy(idx) for idx in v}
            for var_name, v in description['data destinations'].items()
        }

        # Flops
        self._flops = description['flops']

        self.check()