'''
Functions for reading, sampling, and detecting types of datasets

No manipulation or calculation, only description
'''
from __future__ import unicode_literals

import os
import re
import csv
import xlrd
import json
import codecs
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

import boto3
import chardet
import pandas as pd
from werkzeug.utils import secure_filename
from flask import current_app

from dive.base.core import s3_client, compress
from dive.base.db import db_access
from dive.base.exceptions import UploadTooLargeException
from dive.worker.core import celery, task_app
from dive.base.data.access import get_data
from dive.base.data.in_memory_data import InMemoryData as IMD

import logging
logger = logging.getLogger(__name__)


def save_fileobj_to_s3(fileobj, project_id, file_name):
    '''Upload a file object to S3 under the project's prefix and return a
    presigned URL (valid for 24 hours) for the uploaded object.'''
    key = '%s/%s' % (project_id, file_name)
    try:
        s3_client.upload_fileobj(
            fileobj,
            current_app.config['AWS_DATA_BUCKET'],
            key
        )
        # Sign the same key that was just uploaded
        url = s3_client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': current_app.config['AWS_DATA_BUCKET'],
                'Key': key
            },
            ExpiresIn=86400  # 24 hours
        )
    except Exception as e:
        logger.error(e, exc_info=True)
        raise
    return url


def upload_file(project_id, file_obj):
    '''
    1. Save file in uploads/project_id directory
    2. If excel or json, also save CSV versions
    3. If all steps are successful, save file location in project data collection

    Here, file_name = foo.csv and file_title = foo
    '''
    file_name = secure_filename(file_obj.filename)
    file_title, file_type = file_name.rsplit('.', 1)

    # Pre-save properties
    if current_app.config['STORAGE_TYPE'] == 'file':
        path = os.path.join(current_app.config['STORAGE_PATH'], str(project_id), file_name)
    elif current_app.config['STORAGE_TYPE'] == 's3':
        path = 'https://s3.amazonaws.com/%s/%s/%s' % (current_app.config['AWS_DATA_BUCKET'], str(project_id), file_name)

    # Persist file and save to DB
    datasets = save_dataset_to_db(
        project_id,
        file_obj,
        file_title,
        file_name,
        file_type,
        path,
        current_app.config['STORAGE_TYPE']
    )
    file_obj.close()
    return datasets


def get_encoding(file_obj, sample_size=100 * 1024 * 1024):
    '''Detect the character encoding of an initial sample of the file.'''
    try:
        blob = file_obj.read(sample_size)
        file_obj.seek(0)
    except AttributeError:
        # file_obj may already be a byte string rather than a file object
        blob = file_obj
    encoding = chardet.detect(blob)
    return encoding['encoding']


def get_dialect(file_obj, sample_size=1024 * 1024):
    '''Sniff the CSV dialect (delimiter, quoting, line terminator) from an
    initial sample of the file.'''
    try:
        sample = file_obj.read(sample_size)
    except StopIteration:
        sample = file_obj.readline()
    file_obj.seek(0)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(sample)
    result = {
        'delimiter': dialect.delimiter,
        'doublequote': dialect.doublequote,
        'escapechar': dialect.escapechar,
        'lineterminator': dialect.lineterminator,
        'quotechar': dialect.quotechar,
    }
    return result
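

# A minimal usage sketch for the two helpers above (doctest-style; the
# in-memory sample is illustrative and not part of the upload flow). Both
# helpers rewind the file object after sampling, so they can be called
# back to back on the same stream:
#
#   >>> import io
#   >>> sample = io.BytesIO(b'a,b\r\n1,2\r\n')
#   >>> get_dialect(sample)['delimiter']
#   ','
#   >>> get_encoding(sample)
#   'ascii'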


def save_dataset_to_db(project_id, file_obj, file_title, file_name, file_type, path, storage_type, limit_flat_file_size=False):
    encoding = 'utf-8'
    # Default dialect (for Excel and JSON conversion)
    dialect = {
        "delimiter": ",",
        "quotechar": "\"",
        "escapechar": None,
        "doublequote": False,
        "lineterminator": "\r\n"
    }
    file_docs = []
    if file_type in ['csv', 'tsv', 'txt']:
        dialect = get_dialect(file_obj)
        encoding = get_encoding(file_obj)
        file_obj.seek(0)

        # Force encoding to UTF-8
        if (encoding not in ['ascii']) and ('utf' not in encoding) and ('UTF' not in encoding):
            try:
                coerced_content = file_obj.read().decode(encoding).encode('utf-8', 'replace')
            except UnicodeDecodeError:
                try:
                    # The first read() consumed the stream; rewind before retrying
                    file_obj.seek(0)
                    coerced_content = unicode(file_obj.read(), errors='replace')
                except UnicodeDecodeError as UDE:
                    logger.error('Error coercing unicode for file with path %s', path, exc_info=True)
                    raise UDE
            file_obj.seek(0)
            file_obj.write(coerced_content)
            file_obj.truncate()  # Drop trailing bytes if the coerced content is shorter
        file_obj.seek(0)

        if limit_flat_file_size:  # False by default, because limits are enforced on the front-end
            rows = file_obj.readlines()
            cols = rows[0].split(dialect['delimiter'])
            num_rows = len(rows)
            num_cols = len(cols)
            file_obj.seek(0)
            if num_rows > current_app.config['ROW_LIMIT']:
                raise UploadTooLargeException('Uploaded file has {} rows, exceeding row limit of {}'.format(num_rows, current_app.config['ROW_LIMIT']))
            elif num_cols > current_app.config['COLUMN_LIMIT']:
                raise UploadTooLargeException('Uploaded file has {} columns, exceeding column limit of {}'.format(num_cols, current_app.config['COLUMN_LIMIT']))

        file_doc = save_flat_table(project_id, file_obj, file_title, file_name, file_type, path)
        file_docs.append(file_doc)

    elif file_type.startswith('xls'):
        file_docs = save_excel_to_csv(project_id, file_obj, file_title, file_name, file_type, path)

    elif file_type == 'json':
        file_doc = save_json_to_csv(project_id, file_obj, file_title, file_name, file_type, path)
        file_docs.append(file_doc)

    datasets = []
    for file_doc in file_docs:
        path = file_doc['path']
        dataset = db_access.insert_dataset(project_id,
            path=path,
            encoding=encoding,
            dialect=dialect,
            offset=None,
            title=file_doc['file_title'],
            file_name=file_doc['file_name'],
            type=file_doc['type'],
            storage_type=storage_type
        )
        datasets.append(dataset)
    return datasets


def save_flat_table(project_id, file_obj, file_title, file_name, file_type, path):
    file_doc = {
        'file_title': file_title,
        'file_name': file_name,
        'type': file_type,
        'path': path
    }
    if current_app.config['STORAGE_TYPE'] == 'file':
        try:
            file_obj.save(path)
        except IOError:
            logger.error('Error saving file with path %s', path, exc_info=True)
    elif current_app.config['STORAGE_TYPE'] == 's3':
        save_fileobj_to_s3(file_obj, project_id, file_name)
    return file_doc
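

# A minimal sketch (assumed usage, not called anywhere in this module) of how
# the `encoding` and `dialect` persisted by save_dataset_to_db can be consumed
# when a dataset is read back with pandas. `lineterminator` is omitted because
# pandas only accepts a single-character line terminator:
#
#   df = pd.read_csv(
#       path,
#       encoding=encoding,
#       sep=dialect['delimiter'],
#       quotechar=dialect['quotechar'],
#       escapechar=dialect['escapechar'],
#       doublequote=dialect['doublequote']
#   )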


def save_excel_to_csv(project_id, file_obj, file_title, file_name, file_type, path):
    book = xlrd.open_workbook(file_contents=file_obj.read())
    sheet_names = book.sheet_names()

    file_docs = []
    for sheet_name in sheet_names:
        sheet = book.sheet_by_name(sheet_name)
        if sheet.nrows > current_app.config['ROW_LIMIT']:
            raise UploadTooLargeException('Uploaded file has {} rows, exceeding row limit of {}'.format(sheet.nrows, current_app.config['ROW_LIMIT']))
        elif sheet.ncols > current_app.config['COLUMN_LIMIT']:
            raise UploadTooLargeException('Uploaded file has {} columns, exceeding column limit of {}'.format(sheet.ncols, current_app.config['COLUMN_LIMIT']))
        if sheet.nrows == 0:
            continue

        csv_file_title = file_name + '_' + sheet_name
        csv_file_name = csv_file_title + '.csv'

        if current_app.config['STORAGE_TYPE'] == 's3':
            csv_path = 'https://s3.amazonaws.com/%s/%s/%s' % (current_app.config['AWS_DATA_BUCKET'], str(project_id), csv_file_name)
            strIO = StringIO.StringIO()
            wr = csv.writer(strIO, quoting=csv.QUOTE_ALL)
            for rn in xrange(sheet.nrows):
                wr.writerow([unicode(v).encode('utf-8', 'replace') for v in sheet.row_values(rn)])
            strIO.seek(0)
            s3_client.upload_fileobj(strIO,
                current_app.config['AWS_DATA_BUCKET'],
                '%s/%s' % (str(project_id), csv_file_name)
            )
        elif current_app.config['STORAGE_TYPE'] == 'file':
            csv_path = os.path.join(current_app.config['STORAGE_PATH'], str(project_id), csv_file_name)
            csv_file = open(csv_path, 'wb')
            wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
            for rn in xrange(sheet.nrows):
                wr.writerow([unicode(v).encode('utf-8', 'replace') for v in sheet.row_values(rn)])
            csv_file.close()

        file_doc = {
            'file_title': csv_file_title,
            'file_name': csv_file_name,
            'path': csv_path,
            'type': 'csv',
            'orig_type': file_type  # 'xls' or 'xlsx', depending on the upload
        }
        file_docs.append(file_doc)
    return file_docs


def save_json_to_csv(project_id, file_obj, file_title, file_name, file_type, path):
    # Read from the already-open upload stream rather than re-opening `path`,
    # which may be an S3 URL rather than a local file
    json_data = json.load(file_obj)

    csv_file_title = file_title
    csv_file_name = csv_file_title + '.csv'
    csv_path = os.path.join(current_app.config['STORAGE_PATH'], str(project_id), csv_file_name)

    # Assumes a JSON array of flat, homogeneous objects; column order follows
    # the key order of the first object
    header = json_data[0].keys()
    num_rows = len(json_data)
    num_cols = len(header)
    if num_rows > current_app.config['ROW_LIMIT']:
        raise UploadTooLargeException('Uploaded file has {} rows, exceeding row limit of {}'.format(num_rows, current_app.config['ROW_LIMIT']))
    elif num_cols > current_app.config['COLUMN_LIMIT']:
        raise UploadTooLargeException('Uploaded file has {} columns, exceeding column limit of {}'.format(num_cols, current_app.config['COLUMN_LIMIT']))

    csv_file = open(csv_path, 'wb')
    wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
    wr.writerow([v.encode('utf-8') for v in header])
    for item in json_data:
        row = [item[field] for field in header]
        wr.writerow([unicode(v).encode('utf-8') for v in row])
    csv_file.close()

    file_doc = {
        'file_title': csv_file_title,  # Keyed consistently with the other save_* helpers
        'file_name': csv_file_name,
        'path': csv_path,
        'type': 'csv',
        'orig_type': 'json'
    }
    return file_doc
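

# A minimal example of the flattening save_json_to_csv assumes: an array of
# flat, homogeneous objects, with column order taken from the first object's
# keys. For example,
#
#   [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
#
# becomes, with csv.QUOTE_ALL:
#
#   "a","b"
#   "1","x"
#   "2","y"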