import numpy, math, os import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule from reikna.cluda import dtypes, functions from reikna.core import Computation, Transformation, Parameter, Annotation, Type from reikna.fft import FFT from reikna.algorithms import Transpose import reikna.transformations as transformations def hanning_window(arr, NFFT): """ Applies the von Hann window to the rows of a 2D array. To account for zero padding (which we do not want to window), NFFT is provided separately. """ if dtypes.is_complex(arr.dtype): coeff_dtype = dtypes.real_for(arr.dtype) else: coeff_dtype = arr.dtype return Transformation( [ Parameter('output', Annotation(arr, 'o')), Parameter('input', Annotation(arr, 'i')), ], """ ${dtypes.ctype(coeff_dtype)} coeff; %if NFFT != output.shape[0]: if (${idxs[1]} >= ${NFFT}) { coeff = 1; } else %endif { coeff = 0.5 * (1 - cos(2 * ${numpy.pi} * ${idxs[-1]} / (${NFFT} - 1))); } ${output.store_same}(${mul}(${input.load_same}, coeff)); """, render_kwds=dict( coeff_dtype=coeff_dtype, NFFT=NFFT, mul=functions.mul(arr.dtype, coeff_dtype))) def rolling_frame(arr, NFFT, noverlap, pad_to): """ Transforms a 1D array to a 2D array whose rows are partially overlapped parts of the initial array. """ frame_step = NFFT - noverlap frame_num = (arr.size - noverlap) // frame_step frame_size = NFFT if pad_to is None else pad_to result_arr = Type(arr.dtype, (frame_num, frame_size)) return Transformation( [ Parameter('output', Annotation(result_arr, 'o')), Parameter('input', Annotation(arr, 'i')), ], """ %if NFFT != output.shape[1]: if (${idxs[1]} >= ${NFFT}) { ${output.store_same}(0); } else %endif { ${output.store_same}(${input.load_idx}(${idxs[0]} * ${frame_step} + ${idxs[1]})); } """, render_kwds=dict(frame_step=frame_step, NFFT=NFFT), # note that only the "store_same"-using argument can serve as a connector! connectors=['output']) def crop_frequencies(arr): """ Crop a 2D array whose columns represent frequencies to only leave the frequencies with different absolute values. """ result_arr = Type(arr.dtype, (arr.shape[0], arr.shape[1] // 2 + 1)) return Transformation( [ Parameter('output', Annotation(result_arr, 'o')), Parameter('input', Annotation(arr, 'i')), ], """ if (${idxs[1]} < ${input.shape[1] // 2 + 1}) ${output.store_idx}(${idxs[0]}, ${idxs[1]}, ${input.load_same}); """, # note that only the "load_same"-using argument can serve as a connector! connectors=['input']) def getGridSize(blockDim, arrayDim): blockSize = 1 arraySize = 1 for dim in blockDim: blockSize *= dim for dim in arrayDim: arraySize *= dim grid1dim = int(math.ceil(math.sqrt(arraySize/blockSize))) if grid1dim < 1: # Prevent issue where we have a small array size grid1dim = 1 return (grid1dim, grid1dim) def maximum_filter_2d(arr2D, footprint): ## Make sure arr2D is our datatype float32 and footprint of int32 arr2DMaxed = numpy.empty_like(arr2D) head, tail = os.path.split(os.path.abspath(__file__)) # Used so that we can always get the kernel which should be in the same directory as this file maxFunction = open(head + "/2DSlidingMaxFootprintKernel.c", "rt") maxFunction = SourceModule(maxFunction.read()) slidingMaxKernel = maxFunction.get_function("slidingMaxiumum2D") blockSize = [16, 16] # To-do: Add a variable to this, can affect performance based on GPU gridSize = getGridSize(blockSize, arr2D.shape) # Get the size of our grid based on the size of a grid (blocksize) slidingMaxKernel(cuda.In(arr2D), # Input cuda.Out(arr2DMaxed), # Output numpy.int32(footprint.shape[1]), # Kernel Size numpy.int32(arr2D.shape[1]), # Row Stride numpy.int32(1), # Column Stride numpy.int32(int(arr2D.shape[1])), # Array Column Count numpy.int32(int(arr2D.shape[0])), # Array Row Count cuda.In(footprint), block=(blockSize[0],blockSize[1],1), grid=(gridSize[0],gridSize[1],1) ) return arr2DMaxed class Spectrogram(Computation): def __init__(self, x, NFFT=256, noverlap=128, pad_to=None, window=hanning_window): # print("x Data type = %s" % x.dtype) # print("Is Real = %s" % dtypes.is_real(x.dtype)) # print("dim = %s" % x.ndim) assert dtypes.is_real(x.dtype) assert x.ndim == 1 rolling_frame_trf = rolling_frame(x, NFFT, noverlap, pad_to) complex_dtype = dtypes.complex_for(x.dtype) fft_arr = Type(complex_dtype, rolling_frame_trf.output.shape) real_fft_arr = Type(x.dtype, rolling_frame_trf.output.shape) window_trf = window(real_fft_arr, NFFT) broadcast_zero_trf = transformations.broadcast_const(real_fft_arr, 0) to_complex_trf = transformations.combine_complex(fft_arr) amplitude_trf = transformations.norm_const(fft_arr, 1) crop_trf = crop_frequencies(amplitude_trf.output) fft = FFT(fft_arr, axes=(1,)) fft.parameter.input.connect( to_complex_trf, to_complex_trf.output, input_real=to_complex_trf.real, input_imag=to_complex_trf.imag) fft.parameter.input_imag.connect( broadcast_zero_trf, broadcast_zero_trf.output) fft.parameter.input_real.connect( window_trf, window_trf.output, unwindowed_input=window_trf.input) fft.parameter.unwindowed_input.connect( rolling_frame_trf, rolling_frame_trf.output, flat_input=rolling_frame_trf.input) fft.parameter.output.connect( amplitude_trf, amplitude_trf.input, amplitude=amplitude_trf.output) fft.parameter.amplitude.connect( crop_trf, crop_trf.input, cropped_amplitude=crop_trf.output) self._fft = fft self._transpose = Transpose(fft.parameter.cropped_amplitude) Computation.__init__(self, [Parameter('output', Annotation(self._transpose.parameter.output, 'o')), Parameter('input', Annotation(fft.parameter.flat_input, 'i'))]) def _build_plan(self, plan_factory, device_params, output, input_): plan = plan_factory() temp = plan.temp_array_like(self._fft.parameter.cropped_amplitude) plan.computation_call(self._fft, temp, input_) plan.computation_call(self._transpose, output, temp) return plan