"""Lambda entry module: Transcribe submission/polling, DynamoDB bookkeeping, S3 export.

This first section holds the imports, the module-level configuration and the
shared AWS clients used by every function in the file.
"""
import datetime
import json
import math
import os
import stat
import sys
import time
import urllib
import uuid
from decimal import Decimal
from pprint import pprint
from threading import Thread

try:
    from urllib2 import urlopen as wget  # Python 2
except ImportError:
    from urllib.request import urlopen as wget  # Python 3

import boto3
import botocore
from aws_xray_sdk.core import patch_all
from aws_xray_sdk.core import xray_recorder
from boto3.dynamodb.conditions import Attr, Key
from botocore.client import ClientError

# patch_all()
# X-Ray patching is applied in lambda_handler only while this equals 'true'.
XRAY = 'true'

## these are unique and must be set
S3_BUCKET = "not-set"

## hardcoded for console use
# DYNAMO_MAIN = "catfinder5002-main"
# Placeholder so the name is always bound (mirrors S3_BUCKET); the real table
# name is expected to come from the DYNAMO_MAIN environment variable.
DYNAMO_MAIN = "not-set"
DYNAMO_MAIN_GSI = "id_type-id_filename-index"

# Paths of the bundled command-line tools.
FFPROBE = './ffprobe'
FFMPEG = './ffmpeg'
SOX = './sox'

# Seconds added to the creation time to form an item's 'timestamp_ttl'.
MASTER_TTL = 3600

s3 = boto3.resource('s3')
dynamodb = boto3.resource('dynamodb')
transcribe = boto3.client('transcribe', region_name='us-east-1')
translate = boto3.client('translate')
comprehend = boto3.client('comprehend')


def get_environment_variables():
    """Override S3_BUCKET and DYNAMO_MAIN from same-named environment variables.

    A variable that is not set leaves the corresponding module global untouched.
    """
    global S3_BUCKET
    global DYNAMO_MAIN

    bucket = os.environ.get('S3_BUCKET')
    if bucket is not None:
        S3_BUCKET = bucket
        print('environment variable S3_BUCKET was found: {}'.format(S3_BUCKET))

    table_name = os.environ.get('DYNAMO_MAIN')
    if table_name is not None:
        DYNAMO_MAIN = table_name
        print('environment variable DYNAMO_MAIN was found: {}'.format(DYNAMO_MAIN))


# Helper class to convert a DynamoDB item to JSON.
class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serializes Decimal values as their string form."""

    def default(self, o):
        if isinstance(o, Decimal):
            return str(o)
        return super(DecimalEncoder, self).default(o)


try:
    _TEXT_TYPES = (str, unicode)  # Python 2: byte strings and text strings
except NameError:
    _TEXT_TYPES = (str,)  # Python 3: str is the only text type

# Languages every transcript is translated into (in this order).
_TRANSLATE_TARGETS = ('ar', 'zh', 'fr', 'de', 'pt', 'es')


def _epoch_seconds():
    """Return the current UTC time as whole seconds since 1970-01-01."""
    return int((datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).total_seconds())


def get_url(url, time = 20):
    """Fetch `url` and return the response body.

    `time` is the timeout in seconds handed to urlopen. On an HTTP or URL
    error the error code/args are printed and None is returned.
    """
    # Imported locally: the file only imports `urlopen` (as `wget`) at module
    # level, so the exception classes are not otherwise in scope.
    try:
        from urllib2 import HTTPError, URLError  # Python 2
    except ImportError:
        from urllib.error import HTTPError, URLError  # Python 3

    try:
        return wget(url, timeout=time).read()
    except HTTPError as e:  # must precede URLError, its base class
        print(e.code)
    except URLError as e:
        print(e.args)
    return None


def ensure_dir(file_path):
    """Create the directory part of `file_path` if it does not exist yet."""
    directory = os.path.dirname(file_path)
    # A bare filename has no directory part; os.makedirs('') would raise.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)


def save_file(name, data):
    """Write `data` to the file `name`, replacing any previous content."""
    with open(name, 'w+') as f:
        f.write(data)


def delete_file(myfile):
    """Remove `myfile` if it is a regular file; otherwise print an error."""
    if os.path.isfile(myfile):
        os.remove(myfile)
        # print("Success: %s file was deleted" % myfile)
    else:    ## Show an error ##
        print("Error: %s file not found" % myfile)


def get_s3file(BUCKET_NAME, KEY, LOCALFILE):
    """Download s3://BUCKET_NAME/KEY to LOCALFILE.

    A 404 is reported on stdout and swallowed; any other ClientError is
    re-raised.
    """
    try:
        s3.Bucket(BUCKET_NAME).download_file(KEY, LOCALFILE)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist. 404")
        else:
            raise


def update_dyanmo_main ( dynamo_object ):
    """Update transcribe_status / transcribe_transcript of one DYNAMO_MAIN item.

    `dynamo_object` must contain 'id_filename', 'transcribe_status' and
    'transcribe_transcript'. 'timestamp_updated' is set to the current epoch
    second.
    """
    ## DynamoDB Update
    print("put_dynamo to table: " + str(DYNAMO_MAIN))
    print("dynamo_object: {}".format(dynamo_object))
    table = dynamodb.Table(DYNAMO_MAIN)
    response = table.update_item(
        Key={
            'id_filename': dynamo_object['id_filename'],
        },
        UpdateExpression="set timestamp_updated = :timestamp_updated, transcribe_status = :transcribe_status, transcribe_transcript = :transcribe_transcript ",
        ExpressionAttributeValues={
            ':timestamp_updated': _epoch_seconds(),
            ':transcribe_status': dynamo_object['transcribe_status'],
            ':transcribe_transcript': dynamo_object['transcribe_transcript'],
        },
        ReturnValues="UPDATED_NEW"
    )
    # print("dynamo update_item succeeded: {}".format(response))
    print("dynamo update_item succeeded")
    # pprint(response)


def put_dynamo_main(dynamo_object):
    """Insert `dynamo_object` into DYNAMO_MAIN unless its id_filename exists.

    A failed 'attribute_not_exists' condition is logged and ignored; every
    other exception is logged and re-raised with its original traceback.
    """
    print("put_dynamo to table: " + str(DYNAMO_MAIN))
    print("dynamo_object: {}".format(dynamo_object))
    table = dynamodb.Table(DYNAMO_MAIN)
    try:
        response = table.put_item(
            Item=dynamo_object,
            ConditionExpression='attribute_not_exists(id_filename)'
        )
        print("dynamo put_item succeeded: {}".format(response))
    except Exception as e:
        # Ignore the ConditionalCheckFailedException, bubble up other exceptions.
        print("pizzaninja: {}".format(e))
        print('broken dynamo: {}'.format(dynamo_object))
        # Only botocore ClientError carries a `response` dict; anything else
        # must propagate as itself rather than as an AttributeError.
        error_response = getattr(e, 'response', None)
        error_code = None
        if isinstance(error_response, dict):
            error_code = error_response.get('Error', {}).get('Code')
        if error_code != 'ConditionalCheckFailedException':
            raise
        if hasattr(sys, 'exc_clear'):  # exists on Python 2 only
            sys.exc_clear()


def strip (data):
    """Return True when `data` is an empty dict, list or string, else False."""
    if not isinstance(data, (dict, list) + _TEXT_TYPES):
        return False
    return len(data) == 0


def stripper (data):
    """Recursively drop empty dict/list/string values from dicts and lists.

    Emptiness is judged before recursing, so a container that only becomes
    empty after its children were removed is kept. Other values are returned
    unchanged.
    """
    if isinstance(data, dict):
        return {k: stripper(v) for k, v in data.items() if not strip(v)}
    elif isinstance(data, list):
        return [stripper(v) for v in data if not strip(v)]
    return data


def _pdt_timestamp(moment):
    """Format `moment` as ISO text with trailing fraction zeros removed, plus 'Z'."""
    text = moment.isoformat()
    # Only trim zeros of a fractional part; without this check a whole-second
    # value such as '...:30' would be mangled into '...:3'.
    if '.' in text:
        text = text.rstrip('0')
    return text + 'Z'


def _emp_timestamp(moment):
    """Format `moment` as 'YYYY-MM-DDTHH:MM:SS+00:00' (fraction dropped)."""
    return moment.strftime("%Y-%m-%dT%H:%M:%S") + '+00:00'


def _emc_timecode(moment, framerate, rounding):
    """Format `moment` as 'HH:MM:SS:FF'; FF is the sub-second part in frames.

    `rounding` is math.floor for start times and math.ceil for end times.
    """
    frame = int(rounding((moment.microsecond / 1e6) * float(framerate)))
    return moment.strftime("%H:%M:%S:") + str(frame).zfill(2)


def _annotate_item_times(titem, start_seconds, end_seconds, datetime_start, framerate):
    """Add the 'pdt', 'emp' and 'emc' start/end time dicts to a transcript item.

    The offsets are seconds relative to `datetime_start`.
    """
    start = datetime_start + datetime.timedelta(seconds=float(start_seconds))
    end = datetime_start + datetime.timedelta(seconds=float(end_seconds))
    titem['pdt'] = {
        'start_time': _pdt_timestamp(start),
        'end_time': _pdt_timestamp(end),
    }
    titem['emp'] = {
        'start_time': _emp_timestamp(start),
        'end_time': _emp_timestamp(end),
    }
    titem['emc'] = {
        'start_time': _emc_timecode(start, framerate, math.floor),
        'end_time': _emc_timecode(end, framerate, math.ceil),
    }


def _enrich_transcript(transcripts):
    """Attach Comprehend analysis and translations to one transcript entry.

    Scores are stored as strings, matching how the rest of the file stores
    numeric values headed for DynamoDB.
    """
    # Keep at most 999 UTF-8 bytes and drop a trailing partial character.
    # NOTE(review): presumably a service text-size safeguard -- confirm.
    transcript = transcripts['transcript'].encode("utf-8")[:999].decode("utf-8", "ignore")

    # A fresh dict per transcript so entries never share analysis results.
    comprehend_dict = {}

    detect_dominant_language = comprehend.detect_dominant_language(Text=transcript)['Languages'][0]
    detect_dominant_language['Score'] = str(detect_dominant_language['Score'])
    comprehend_dict['detect_dominant_language'] = detect_dominant_language
    source_lang = detect_dominant_language['LanguageCode']

    # key_phrases
    key_phrases = comprehend.detect_key_phrases(Text=transcript, LanguageCode=source_lang)['KeyPhrases']
    for key_phrase in key_phrases:
        key_phrase['Score'] = str(key_phrase['Score'])
    comprehend_dict['key_phrases'] = key_phrases

    # sentiment
    detect_sentiment = comprehend.detect_sentiment(Text=transcript, LanguageCode=source_lang)
    comprehend_dict['sentiment'] = detect_sentiment['Sentiment']
    comprehend_dict['sentiment_score'] = {
        key: str(value) for key, value in detect_sentiment['SentimentScore'].items()
    }

    # detect_entities
    detect_entities = comprehend.detect_entities(Text=transcript, LanguageCode=source_lang)['Entities']
    for detect_entity in detect_entities:
        detect_entity['Score'] = str(detect_entity['Score'])
    comprehend_dict['detect_entities'] = detect_entities

    transcripts['comprehend'] = comprehend_dict

    ## translate
    transcripts[source_lang] = transcript
    for target_lang in _TRANSLATE_TARGETS:
        transcripts[target_lang] = translate.translate_text(
            Text=transcript,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang
        )['TranslatedText']


def transcribe_results():
    """Poll Transcribe for the latest 'minute' items and store finished transcripts.

    Items stored as IN_PROGRESS get their status refreshed from Transcribe.
    Items stored as COMPLETED whose transcript is still 'IN_PROGRESS' have the
    transcript downloaded, cleaned, enriched with Comprehend/Translate data
    and per-word timestamps, and written back to DynamoDB.
    """
    ##### Transcribe Status updates
    print("query of table: " + str(DYNAMO_MAIN))
    table = dynamodb.Table(DYNAMO_MAIN)
    response = table.query(
        Limit=10,
        IndexName=DYNAMO_MAIN_GSI,
        ScanIndexForward=False,
        KeyConditionExpression=Key('id_type').eq('minute'),
    )
    # Default; once an item supplies a framerate it also applies to later items.
    framerate = '30.0'
    for item in response['Items']:
        if item['transcribe_status'] == 'IN_PROGRESS':
            transcribe_job = transcribe.get_transcription_job(TranscriptionJobName=item['transcribe_job'])
            transcribe_status = transcribe_job['TranscriptionJob']['TranscriptionJobStatus']
            print('transcribe status: {}'.format(transcribe_status))
            update_dyanmo_main({
                'id_filename': item['id_filename'],
                'transcribe_status': transcribe_status,
                'transcribe_transcript': 'IN_PROGRESS',
            })

        if item['transcribe_status'] != 'COMPLETED':
            continue
        if item['transcribe_transcript'] != 'IN_PROGRESS':
            print('{} dynamo already been updated... skipping '.format(item['id_filename']))
            continue

        print('{} needs to be updated dynamo \t transcribe_status status: {} \t transcribe_transcript: {}'.format(item['id_filename'], item['transcribe_status'], item['transcribe_transcript']))
        if 'framerate' in item:
            framerate = item['framerate']

        transcribe_job = transcribe.get_transcription_job(TranscriptionJobName=item['transcribe_job'])
        transcribe_status = transcribe_job['TranscriptionJob']['TranscriptionJobStatus']
        transcribe_transcript_uri = transcribe_job['TranscriptionJob']['Transcript']['TranscriptFileUri']
        transcribe_transcript_string = get_url(transcribe_transcript_uri)
        if transcribe_transcript_string is None:
            # get_url already printed the error; the item keeps its
            # 'IN_PROGRESS' transcript marker, so a later run retries it.
            print('{} transcript download failed... skipping '.format(item['id_filename']))
            continue
        transcribe_transcript = stripper(json.loads(transcribe_transcript_string)['results'])

        datetime_start = datetime.datetime.strptime(item['timestamp_start'], '%Y-%m-%dT%H:%M:%S.%fZ')

        ## SENTENCES
        if 'transcripts' in transcribe_transcript:
            for transcripts in transcribe_transcript['transcripts']:
                if 'transcript' in transcripts:
                    _enrich_transcript(transcripts)
                else:
                    print('no transript in transcripts')

        ## WORDS
        if 'items' in transcribe_transcript:
            last_end_time = 0
            for titem in transcribe_transcript['items']:
                if titem['type'] == 'pronunciation':
                    _annotate_item_times(titem, titem['start_time'], titem['end_time'], datetime_start, framerate)
                    last_end_time = titem['end_time']
                elif titem['type'] == 'punctuation':
                    # Punctuation gets the end time of the preceding word as
                    # both its start and end.
                    titem['start_time'] = last_end_time
                    titem['end_time'] = last_end_time
                    _annotate_item_times(titem, last_end_time, last_end_time, datetime_start, framerate)

        update_dyanmo_main({
            'id_filename': item['id_filename'],
            'transcribe_status': transcribe_status,
            'transcribe_transcript': transcribe_transcript,
        })


def _item_timecode(item):
    """Return 'minute:SS:FF' built from an item's timestamp_* fields."""
    return item['timestamp_minute'] + ':' + item['timestamp_second'].zfill(2) + ':' + item['timestamp_frame'].zfill(2)


def _collect_scene_metadata(scenechanges):
    """Query each scene-change item and gather its Rekognition and scene data.

    Returns (rekog_list, rekog_word_list, rekog_celeb_list, rekog_facial_list,
    scene_list).
    """
    rekog_list = []
    rekog_word_list = []
    rekog_celeb_list = []
    rekog_facial_list = []
    scene_list = []
    for scenechange in scenechanges:
        print("query of table: " + str(DYNAMO_MAIN))
        table = dynamodb.Table(DYNAMO_MAIN)
        scene_response = table.query(
            KeyConditionExpression=Key('id_filename').eq(scenechange),
        )
        for scene_item in scene_response['Items']:
            timecode = _item_timecode(scene_item)
            # Guarded like the sibling attributes below.
            for rekog_label in scene_item.get('rekog_labels', []):
                rekog_label['timestamp'] = timecode
                rekog_list.append(rekog_label)
            if 'rekog_words' in scene_item:
                for rekog_word in scene_item['rekog_words']:
                    rekog_word['timestamp'] = timecode
                    rekog_word_list.append(rekog_word)
            if 'rekog_celebs' in scene_item:
                for rekog_celeb in scene_item['rekog_celebs']:
                    rekog_celeb['timestamp'] = timecode
                    rekog_celeb_list.append(rekog_celeb)
            if 'rekog_facial' in scene_item:
                # Facial entries are collected without a timestamp.
                rekog_facial_list.extend(scene_item['rekog_facial'])
            scene_list.append({
                'scene': scene_item['id_filename'],
                'strength': scene_item['scenedetect'],
                'timestamp': timecode,
            })
    return rekog_list, rekog_word_list, rekog_celeb_list, rekog_facial_list, scene_list


def _submit_minute(minute, bucket, framerate, tmpdir, files_to_delete):
    """Concatenate one minute of audio, upload it and start its Transcribe job.

    Every local file created is appended to `files_to_delete` so the caller
    can clean up.
    """
    import subprocess  # local import: not part of the file's import block

    (rekog_list, rekog_word_list, rekog_celeb_list,
     rekog_facial_list, scene_list) = _collect_scene_metadata(bucket['scenechanges'])

    files_to_concat = []
    list_of_timestamps = []
    for second, audiofile in bucket['timestamps'].items():
        if second != 'duration':
            print('downloading audio for minute: {}, second: {}, filename: {}, in dir: {}'.format(minute, second, audiofile, tmpdir))
            get_s3file(S3_BUCKET, 'audio/' + audiofile, tmpdir + audiofile)
            list_of_timestamps.append(str(second))
            files_to_concat.append(str(tmpdir + audiofile))
            files_to_delete.append(str(tmpdir + audiofile))
            update_dyanmo_main({
                'id_filename': audiofile.replace('.wav', '.ts'),
                'transcribe_status': 'CONCAT',
                'transcribe_transcript': 'CONCAT',
            })
    files_to_concat.sort()
    list_of_timestamps.sort()
    print('first timestamp is: {}'.format(list_of_timestamps[0]))
    timestamp_start = list_of_timestamps[0]

    file_to_upload = 'stream_' + minute.replace('-', '').replace(' ', '').replace(':', '')
    concat_path = tmpdir + file_to_upload + '.wav'
    files_to_delete.append(str(concat_path))

    # Argument list instead of a shell string: file names are never
    # interpreted by a shell.
    sox = subprocess.Popen(
        [SOX] + files_to_concat + [concat_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT
    )
    sox_output, _ = sox.communicate()
    if sox.returncode != 0:
        print('sox output: {}'.format(sox_output))

    with open(concat_path, 'rb') as data:
        pprint(s3.Bucket(S3_BUCKET).put_object(Key='audio/' + file_to_upload + '.wav', Body=data))

    job_name = file_to_upload
    job_uri = "https://s3.amazonaws.com/" + S3_BUCKET + "/audio/" + file_to_upload + ".wav"
    try:
        transcribe.start_transcription_job(
            TranscriptionJobName=job_name,
            Media={'MediaFileUri': job_uri},
            MediaFormat='wav',
            LanguageCode='en-US',
            MediaSampleRateHertz=16000
        )
    except botocore.exceptions.ClientError as e:
        # Logged only; the status lookup below still runs.
        pprint(e)

    transcribe_job = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    transcribe_status = transcribe_job['TranscriptionJob']['TranscriptionJobStatus']
    print('transcribe status: {}'.format(transcribe_status))

    now = _epoch_seconds()
    dynamo_segment_object = {
        'id_filename': file_to_upload + '.wav',
        'id_type': 'minute',
        'timestamp_minute': minute,
        'framerate': framerate,
        'timestamp_start': timestamp_start,
        'duration': str(bucket['duration']),
        'audio_file': file_to_upload,
        'transcribe_job': job_name,
        'transcribe_status': transcribe_status,
        'rekog_labels': rekog_list,
        'rekog_words': rekog_word_list,
        'rekog_celebs': rekog_celeb_list,
        'rekog_facial': rekog_facial_list,
        'scenes': scene_list,
        'timestamp_created': now,
        'timestamp_ttl': now + MASTER_TTL,  # creation time plus MASTER_TTL seconds
    }
    put_dynamo_main(dynamo_segment_object)


def transcribe_submit():
    """Group unsubmitted audio segments by minute and submit full minutes.

    The latest 'segment' items with status NOT_SUBMITTED are grouped by
    'timestamp_minute'. Each group with more than 55 seconds of audio is
    concatenated with sox, uploaded to S3, submitted to Transcribe and
    recorded as a 'minute' item. Downloaded and generated files are removed
    afterwards, also when a step raises.
    """
    #### Transcribe Job Design
    print("query of table: " + str(DYNAMO_MAIN))
    table = dynamodb.Table(DYNAMO_MAIN)
    response = table.query(
        Limit=20,
        IndexName=DYNAMO_MAIN_GSI,
        ScanIndexForward=False,
        KeyConditionExpression=Key('id_type').eq('segment'),
    )

    full_list = {}
    # Default; the framerate of the last item that has one is used.
    framerate = '30.0'
    for item in response['Items']:
        if 'framerate' in item:
            framerate = item['framerate']
        if item['transcribe_status'] == 'NOT_SUBMITTED':
            bucket = full_list.setdefault(
                item['timestamp_minute'],
                {'duration': float(0), 'scenechanges': [], 'timestamps': {}}
            )
            bucket['timestamps'][str(item['timestamp_pdt'])] = item['audio_file']
            bucket['duration'] += float(item['duration'])
            bucket['scenechanges'] = bucket['scenechanges'] + item['scenechange_list']

    ## set tmp directory
    tmpdir = '/tmp/' + str(uuid.uuid4()) + '/'
    ensure_dir(tmpdir)

    files_to_delete = []
    try:
        for minute, bucket in full_list.items():
            if bucket['duration'] > 55:
                _submit_minute(minute, bucket, framerate, tmpdir, files_to_delete)
            else:
                print('skipping due to not enough seconds for the minute')
    finally:
        for file_to_delete in files_to_delete:
            delete_file(file_to_delete)


def write_json():
    """Export the latest 45 'minute' items to S3 as 'list-minute.json'."""
    ## set tmp directory
    tmpdir = '/tmp/' + str(uuid.uuid4()) + '/'
    ensure_dir(tmpdir)

    rekog_type = 'minute'
    print("query of table: " + str(DYNAMO_MAIN))
    table = dynamodb.Table(DYNAMO_MAIN)
    response = table.query(
        Limit=45,
        IndexName=DYNAMO_MAIN_GSI,
        ScanIndexForward=False,
        KeyConditionExpression=Key('id_type').eq(rekog_type),
    )
    json_string = json.dumps(response['Items'], cls=DecimalEncoder)
    dynamo_filename = 'list-' + rekog_type + '.json'
    local_path = tmpdir + dynamo_filename
    with open(local_path, 'w') as outfile:
        outfile.write(json_string)
    try:
        with open(local_path, 'rb') as data:
            pprint(s3.Bucket(S3_BUCKET).put_object(Key=dynamo_filename, Body=data))
    finally:
        delete_file(local_path)


def lambda_handler(event, context):
    """Lambda entry point: poll results, submit new jobs, export the JSON list.

    `event` and `context` are not used. Returns a fixed success string.
    """
    if XRAY == 'true':
        patch_all()
    get_environment_variables()
    transcribe_results()
    transcribe_submit()
    write_json()
    return 'SUCCESS: it ran'


if __name__ == '__main__':
    # This is to run for local testing
    XRAY = 'false'       # stop xray when doing local testing
    FFPROBE = 'ffprobe'  # use local mac version
    FFMPEG = 'ffmpeg'    # use local mac version
    SOX = 'sox'          # use local mac version
    os.environ['DYNAMO_MAIN'] = 'nab2018-catfinder5003-main'
    os.environ['S3_BUCKET'] = 'nab2018-catfinder5003'
    print(lambda_handler(None, None))
    with open('deploy', 'w') as outfile:
        outfile.write('lambda-uploader --variables \'{"S3_BUCKET": "' + S3_BUCKET + '","DYNAMO_MAIN": "' + DYNAMO_MAIN + '" }\'')
    with open('logs', 'w') as outfile:
        outfile.write('awslogs get /aws/lambda/catfinder5002-transcribe ALL --watch')