Python pandas.unique() Examples

The following are code examples showing how to use pandas.unique(). They are extracted from open-source Python projects.
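
For quick reference, here is a minimal sketch of pandas.unique() itself (behaviour as of recent pandas versions): it returns the distinct values in order of first appearance, as a NumPy array, and accepts a Series, Index, or 1-D array.

import numpy as np
import pandas as pd

values = pd.Series(['b', 'a', 'b', 'c', 'a'])
print(pd.unique(values))                     # ['b' 'a' 'c'] -- order of first appearance
print(np.unique(values))                     # ['a' 'b' 'c'] -- numpy sorts instead
print(pd.unique(np.array([1, 3, 3, 2, 1])))  # [1 3 2]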

Example 1
Project: coquery   Author: gkunter   File: visualizer.py
def get_levels(self, name):
        """
        Return all distinct values in the column 'name'.

        The values are returned in their order of first appearance.

        Parameters
        ----------
        name : string
            The column name for which the unique values are requested

        Returns
        -------
        levels : ndarray
            The distinct values contained in the specified data column,
            in their order of first appearance.
        """
        return pd.unique(self._table[name].values.ravel()) 
Example 2
Project: deep-action-proposals   Author: escorciav   File: daps_detection.py
def wrapper_nms(proposal_df, overlap=0.65):
    """Apply non-max-suppresion to a video batch.
    """
    vds_unique = pd.unique(proposal_df['video-name'])
    new_proposal_df = []
    for i, v in enumerate(vds_unique):
        idx = proposal_df['video-name'] == v
        p = proposal_df.loc[idx, ['video-name', 'f-init', 'f-end',
                                  'score', 'video-frames']]
        n_frames = np.int(p['video-frames'].mean())
        loc = np.stack((p['f-init'], p['f-end']), axis=-1)
        loc, score = nms_detections(loc, np.array(p['score']), overlap)
        n_proposals = score.shape[0]
        n_frames = np.repeat(p['video-frames'].mean(), n_proposals).astype(int)
        this_df = pd.DataFrame({'video-name': np.repeat(v, n_proposals),
                                'f-init': loc[:, 0], 'f-end': loc[:, 1],
                                'score': score,
                                'video-frames': n_frames})
        new_proposal_df.append(this_df)
    return pd.concat(new_proposal_df, axis=0) 
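
Example 2 above visits each unique video name returned by pd.unique and slices the frame inside the loop. The same grouping step can be sketched with groupby, which also preserves order of appearance when sort=False; nms_fn below stands in for the project's nms_detections and is only an assumption here.

import numpy as np
import pandas as pd

def wrapper_nms_groupby(proposal_df, nms_fn, overlap=0.65):
    """Per-video non-max suppression, expressed with groupby instead of pd.unique."""
    out = []
    for video, p in proposal_df.groupby('video-name', sort=False):
        loc = np.stack((p['f-init'].values, p['f-end'].values), axis=-1)
        loc, score = nms_fn(loc, p['score'].values, overlap)
        out.append(pd.DataFrame({'video-name': np.repeat(video, len(score)),
                                 'f-init': loc[:, 0], 'f-end': loc[:, 1],
                                 'score': score,
                                 'video-frames': np.repeat(int(p['video-frames'].mean()), len(score))}))
    return pd.concat(out, axis=0)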
Example 3
Project: gullikson-scripts   Author: kgullikson88   File: CCF_Systematics.py
def get_detected_objects(df, tol=1.0, debug=False):
    """
    Takes a summary dataframe with RV information. Finds the median rv for each star,
      and removes objects that are more than 'tol' km/s from the median value
    :param df: A summary dataframe, such as created by get_ccf_summary or find_best_pars
    :param tol: The tolerance, in km/s, to accept an observation as detected
    :param debug: If True, print the median RV found for each star
    :return: a dataframe containing only detected companions
    """
    secondary_names = pd.unique(df.Secondary)
    secondary_to_rv = defaultdict(float)
    for secondary in secondary_names:
        rv = df.loc[df.Secondary == secondary]['rv'].median()
        secondary_to_rv[secondary] = rv

    if debug:
        for secondary in sorted(secondary_to_rv.keys()):
            print ('RV for {}: {:.2f} km/s'.format(secondary, secondary_to_rv[secondary]))

    keys = df.Secondary.values
    good = df.loc[abs(df.rv.values - np.array(itemgetter(*keys)(secondary_to_rv))) < tol]
    return good 
Example 4
Project: gullikson-scripts   Author: kgullikson88   File: HDF5_Helpers.py
def list_stars(self, print2screen=False):
        """
        List all of the stars in all of the CCF interfaces

        Parameters:
        ===========
        - print2screen:     bool
                            Should we print the stars and dates to screen?

        Returns:
        =========
        - star_list:        list
                            A list of every star in the file, sorted by name.
        """
        stars = []
        for inst in self._interfaces.keys():
            if print2screen:
                print('Stars observed with {}: \n============================\n\n'.format(inst))
            stars.extend(self._interfaces[inst].list_stars(print2screen=print2screen))

        return list(pd.unique(stars)) 
Example 5
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np.array(['2015-01-03T00:00:00.000000000+0000',
                             '2015-01-01T00:00:00.000000000+0000'],
                            dtype='M8[ns]')

        dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000',
                                   '2015-01-01T00:00:00.000000000+0000'])
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        s = pd.Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype) 
Example 6
Project: KaggleExeter   Author: detomo   File: build_features.py
def rename_brands(phone_models):
	""" recast all phone brands and model as string integers brand_i and model_j """
	brands_table = {}
	i = 0
	for brand in pd.unique(phone_models['phone_brand']):
		brands_table[brand] = 'brand_%s' %i
		i += 1

	models_table = {}
	i = 0
	for model in pd.unique(phone_models['device_model']):
		models_table[model] = 'model_%s' %i
		i += 1

	converted = []
	for item in zip(phone_models['phone_brand'],phone_models['device_model']):
		converted.append((brands_table[item[0]],models_table[item[1]]))
	phone_models['phone_brand'] = [x[0] for x in converted]
	phone_models['device_model'] = [x[1] for x in converted]
	return phone_models 
Example 7
Project: KaggleExeter   Author: detomo   File: test_submission.py
def rename_brands(phone_models):
	""" recast all phone brands and model as string integers brand_i and model_j """
	brands_table = {}
	i = 0
	for brand in pd.unique(phone_models['phone_brand']):
		brands_table[brand] = 'brand_%s' %i
		i += 1

	models_table = {}
	i = 0
	for model in pd.unique(phone_models['device_model']):
		models_table[model] = 'model_%s' %i
		i += 1

	converted = []
	for item in zip(phone_models['phone_brand'],phone_models['device_model']):
		converted.append((brands_table[item[0]],models_table[item[1]]))
	phone_models['phone_brand'] = [x[0] for x in converted]
	phone_models['device_model'] = [x[1] for x in converted]
	return phone_models 
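
Examples 6 and 7 build the brand and model lookup tables with an explicit counter over pd.unique. For reference, the same recoding can be sketched more compactly with enumerate and Series.map; the helper name is ours and the column names are assumed to match the examples.

import pandas as pd

def rename_brands_compact(phone_models):
    """Recode phone_brand/device_model to 'brand_i'/'model_j' strings."""
    brands = {b: 'brand_%d' % i for i, b in enumerate(pd.unique(phone_models['phone_brand']))}
    models = {m: 'model_%d' % i for i, m in enumerate(pd.unique(phone_models['device_model']))}
    phone_models['phone_brand'] = phone_models['phone_brand'].map(brands)
    phone_models['device_model'] = phone_models['device_model'].map(models)
    return phone_models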
Example 8
Project: calvin   Author: ucd-cws   File: calvin.py
def __init__(self, linksfile, ic=None):
    df = pd.read_csv(linksfile)
    df['link'] = df.i.map(str) + '_' + df.j.map(str) + '_' + df.k.map(str)
    df.set_index('link', inplace=True)

    self.df = df

    # self.T = len(self.df)
    SR_stats = pd.read_csv('calvin/data/SR_stats.csv', index_col=0).to_dict()
    self.min_storage = SR_stats['min']
    self.max_storage = SR_stats['max']

    if ic:
      self.apply_ic(ic)

    # a few network fixes to make things work
    self.add_ag_region_sinks()
    self.fix_hydropower_lbs()

    self.nodes = pd.unique(df[['i','j']].values.ravel()).tolist()
    self.links = list(zip(df.i,df.j,df.k))
    self.networkcheck() # make sure things aren't broken 
Example 9
Project: skp_edu_docker   Author: TensorMSA   File: data_node_frame.py
def make_unique_value_each_column (self, df, node_id):
        """ Dataframe? ??? ???? ??? ??? ?? ??? ???? 
            Unique Value return in Dataframe
        Args:
          params:
            * df : dataframe
            * node_id: nnid
        Returns:
            json
        Raises:
        """
        try:
            data_conf = dict()
            column_cate_unique = dict()
            numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
            for i, v in df.dtypes.iteritems():
                if (str(v) not in numerics):  # maybe need float
                    column_cate_unique[i] = df[i].unique().size
            data_conf['unique_cell_feature'] = column_cate_unique
            data_conf_json_str = json.dumps(data_conf)
            data_conf_json = json.loads(data_conf_json_str)
            return data_conf_json
        except Exception as e:
            logging.error("make_unique_value_each_column error : {0}, {1}".format(i,v))
            raise e 
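
Example 9 above checks each column's dtype explicitly and calls Series.unique() per column. A compact equivalent, assuming the same intent of excluding numeric columns, is select_dtypes combined with nunique():

import pandas as pd

def count_categorical_uniques(df):
    # nunique() counts distinct values per column; numeric dtypes are excluded as in Example 9
    return {'unique_cell_feature': df.select_dtypes(exclude='number').nunique().to_dict()}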
Example 10
Project: extract   Author: dblalock   File: unsupervised.py
def makeTable(df, rowsCol, colsCol, dataCol):
	# df.set_index(rowsCol)

	uniqRowVals = pd.unique(df[rowsCol])
	uniqColVals = pd.unique(df[colsCol])

	# "rows col = ", df[rowsCol]
	# print "uniq row vals", uniqRowVals
	# print "uniq col vals", uniqColVals
	# print df[[rowsCol, colsCol, dataCol]]

	out = pd.DataFrame(index=uniqRowVals, columns=uniqColVals)
	for rowVal in uniqRowVals:
		for colVal in uniqColVals:
			rowsMatch = df[rowsCol] == rowVal
			colsMatch = df[colsCol] == colVal
			thisIdx = np.where(rowsMatch * colsMatch)[0][0]
			out.loc[rowVal, colVal] = df[dataCol].iloc[thisIdx]

	return out 
Example 11
Project: StackedDAE   Author: glrs   File: utils.py
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string) 
    try:
        label_col = int(label_col)
        
        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass
    
    import pandas as pd
    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    #num_classes = unique_classes.shape[0]
    
    # Map the unique n classes with a number from 0 to n  
    label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))})
    
    # Replace the given column's values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist())
    
    # Return the mapped labels as numpy list and the label map (unique classes and number can be obtained from map)
    return np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],)), np.asarray(label_map) #, unique_classes, num_classes 
Example 12
Project: StackedDAE   Author: glrs   File: data_handler.py
def label_metadata(label_matrix, label_col):
    # Check whether the column value is given as index (number) or name (string) 
    try:
        label_col = int(label_col)
        
        # If given as number, take the name of the column out of it
        label_col = label_matrix.columns[label_col]
    except ValueError:
        pass

    # Get the unique classes in the given column, and how many of them are there
    unique_classes = pd.unique(label_matrix[label_col].ravel())
    
    # Map the unique n classes with a number from 0 to n
    label_map = pd.DataFrame({label_col: unique_classes, label_col+'_id':range(len(unique_classes))})
    
    # Replace the given column values with the mapped equivalent
    mapped_labels = label_matrix.replace(label_map[[0]].values.tolist(), label_map[[1]].values.tolist())
#     print("label_matrix", label_matrix)
#     print("mapped_labels", mapped_labels)

    # Return the mapped labels as ndarray and the label map (unique classes and number can be obtained from map)
    # np.reshape(mapped_labels[[label_col]].values, (mapped_labels.shape[0],))
    # Return the mapped labels as DataFrame and the label map (unique classes and number can be obtained from map)
    return mapped_labels[[label_col]], np.asarray(label_map) #, unique_classes, num_classes 
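
Both label_metadata variants above map each unique class (via pd.unique) to an integer id. pandas.factorize performs the same mapping in one call; the sketch below is ours, assuming label_col already names a column.

import numpy as np
import pandas as pd

def label_metadata_factorize(label_matrix, label_col):
    """Map class labels to integer ids, mirroring Examples 11/12 with pd.factorize."""
    codes, unique_classes = pd.factorize(label_matrix[label_col])
    # codes follow the order of first appearance, exactly like pd.unique
    label_map = np.column_stack([unique_classes, np.arange(len(unique_classes))])
    return codes, label_map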
Example 13
Project: triage   Author: dssg   File: create_inspections_subset.py
def create_subset(src, dest, n=250):
    "Given a csv file `src`, create a subset `dest` with `n` unique entities"
    df = pd.read_csv(src)
    lics = pd.unique(df["License #"])
    sublics = lics[random.sample(range(0,len(lics)), n)]
    subset = df[df["License #"].isin(sublics)]
    # Make the column names a little more readable
    subset.columns = map(clean_column_name, subset.columns)
    subset.to_csv(dest, index=False) 
Example 14
Project: johnson-county-ddj-public   Author: dssg   File: feature_processor.py
def convert_categorical(df):
    onecol = df.columns[1]
    onecol_name = df.columns.values.tolist()[1]
    df[onecol] = df[onecol].str.lower()
    categories = pd.unique(df[onecol])


    categories = [x for x in categories if x is not None]

    try:
        categories.remove(' ')
    except:
        pass

    categories = [str(x) for x in categories]

    categories = list(set([str.lower(x).strip() for x in categories]))

    #replaces spaces in middle of word w underscores
    categories = list(set([x.replace(" ", '_') for x in categories]))

    featnames = []
    for i in range(len(categories)):
        if type(categories[i]) is str:
            newfeatstr = onecol_name+'_is_' + categories[i] 
            featnames.append(newfeatstr)
            df[newfeatstr] = (df[onecol] == categories[i])

    onecol_null = onecol_name + "_is_null"
    df[onecol_null] = pd.isnull(df[onecol])
    df[onecol_null] = df[onecol_null].astype(float)
    df = df.drop(onecol, axis=1)
    df[featnames] = df[featnames].astype(float)
    df = df.groupby(config_db['id_column'], sort = False, as_index=False)[featnames].max()
    return df, featnames 
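
Example 14 hand-rolls one-hot columns from the categories returned by pd.unique. For comparison, pd.get_dummies can produce the same kind of expansion; the sketch below is a simplified assumption that omits the original's groupby over config_db['id_column'].

import pandas as pd

def convert_categorical_dummies(df, col):
    """One-hot encode `col` into <col>_is_<category> float columns, plus a null flag."""
    cleaned = df[col].str.lower().str.strip().str.replace(' ', '_')
    dummies = pd.get_dummies(cleaned, prefix=col + '_is', prefix_sep='_').astype(float)
    dummies[col + '_is_null'] = df[col].isnull().astype(float)
    return pd.concat([df.drop(columns=[col]), dummies], axis=1)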
Example 15
Project: coquery   Author: gkunter   File: visualizer.py
def _validate_layout(func):
        def func_wrapper(self):
            if self._col_wrap:
                if self._col_wrap > 16:
                    raise VisualizationInvalidLayout
                else:
                    return func(self)
            if self._col_factor and len(pd.unique(self._table[self._col_factor].values.ravel())) > 16:
                raise VisualizationInvalidLayout
            if self._row_factor and len(pd.unique(self._table[self._row_factor].values.ravel())) > 16:
                raise VisualizationInvalidLayout
            return func(self)
        return func_wrapper 
Example 16
Project: fstd2nc   Author: neishm   File: __init__.py
def vectorize (f):
  from functools import wraps
  try:
    from pandas import Series, unique
    @wraps(f)
    def vectorized_f (x):
      # If we're given a scalar value, then simply return it.
      if not hasattr(x,'__len__'):
        return f(x)
      # Get unique values
      inputs = unique(x)
      outputs = map(f,inputs)
      table = dict(zip(inputs,outputs))
      result = Series(x).map(table)
      return result.values
  except ImportError:
    def cached_f(x, cache={}):
      if x not in cache:
        cache[x] = f(x)
      return cache[x]
    @wraps(f)
    def vectorized_f (x):
      # If we're given a scalar value, then simply return it.
      if not hasattr(x,'__len__'):
        return cached_f(x)
      return map(cached_f,x)
  return vectorized_f


# The type of data returned by the Buffer iterator. 
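
A short usage sketch of the vectorize decorator above, assuming pandas is importable and the decorator is in scope: the wrapped function is evaluated once per unique input value, and the per-element results are broadcast back with Series.map. The decorated function here is hypothetical.

import numpy as np

@vectorize
def normalize_code(code):
    # stand-in for a costly per-value computation
    return str(code).strip().upper()

x = np.array([' a1 ', ' a1 ', 'b2', ' a1 '])
print(normalize_code(x))   # ['A1' 'A1' 'B2' 'A1'] -- computed for only two unique inputs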
Example 17
Project: ImgAnnotaPyQt4   Author: ZhengRui   File: annotaMain.py
def saveLabel(self):
        if not len(self.labelFile):
            self.labelFile = QtGui.QFileDialog.getSaveFileName(self, 'Save Label File', os.path.expanduser('~'), 'Txt (*.txt)')

        if len(self.labelFile):
            self.updateLabelsBuf()
            if self.labelsBuf is not None:
                if self.labels is None:
                    self.labels = self.labelsBuf
                
                self.labels = self.labels[~self.labels.image.isin(pd.unique(self.labelsBuf.image.ravel()))]
                self.labelsBuf = self.labelsBuf[self.labelsBuf.cateid.notnull()]
                self.labels = self.labels.append(self.labelsBuf, ignore_index=True)
                self.labels.to_csv(self.labelFile, index=False)
                self.labelsBuf = self.labelsBuf[self.labelsBuf.image == os.path.basename(self.imgsList[self.ith])] 
Example 18
Project: gullikson-scripts   Author: kgullikson88   File: CCF_Systematics.py
def add_actual_temperature(df, method='excel', filename='SecondaryStar_Temperatures.xls'):
    """
    Add the actual temperature to a given summary dataframe
    :param df: The dataframe to which we will add the actual secondary star temperature
    :keyword method: How to get the actual temperature. Options are:
                   - 'spt': Use main-sequence relationships to go from spectral type --> temperature
                   - 'excel': Use tabulated data, available in the file 'SecondaryStar_Temperatures.xls'
    :keyword filename: The filename of the excel spreadsheet containing the literature temperatures.
                       Needs to have the right format! Ignored if method='spt'
    :return: the dataframe, with extra columns for the secondary star temperature and its uncertainty
    """
    # First, get a list of the secondary stars in the data
    secondary_names = pd.unique(df.Secondary)
    secondary_to_temperature = defaultdict(float)
    secondary_to_error = defaultdict(float)

    if method.lower() == 'spt':
        MS = SpectralTypeRelations.MainSequence()
        for secondary in secondary_names:
            star_data = StarData.GetData(secondary)
            spt = star_data.spectype[0] + re.search('[0-9]\.*[0-9]*', star_data.spectype).group()
            T_sec = MS.Interpolate(MS.Temperature, spt)
            secondary_to_temperature[secondary] = T_sec

    elif method.lower() == 'excel':
        table = pd.read_excel(filename, 0)
        for secondary in secondary_names:
            T_sec = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())]['Literature_Temp'].item()
            T_error = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())][
                'Literature_error'].item()
            secondary_to_temperature[secondary] = T_sec
            secondary_to_error[secondary] = T_error

    df['Tactual'] = df['Secondary'].map(lambda s: secondary_to_temperature[s])
    df['Tact_err'] = df['Secondary'].map(lambda s: secondary_to_error[s])
    return df
Example 19
Project: gullikson-scripts   Author: kgullikson88   File: CCF_Systematics.py
def fit_sigma(df, i):
    """
    Find the largest allowable standard deviation, given the possible values Tactual can take.
    """
    Tmeasured, Tactual, _, _ = get_values(df)
    Tm = Tmeasured[i]
    
    # Get the possible values, and bin those with this measured value
    possible_values = sorted(pd.unique(df.Tactual))
    edges = [(possible_values[i] + possible_values[i+1])/2 for i in range(len(possible_values)-1)]
    bins = [0] + edges + [9e9]
    good = df.loc[df.Temperature == Tm]
    values, _= np.histogram(good.Tactual.values, bins=bins)
    
    mean = np.mean(good.Tactual.values)
    std = np.std(good.Tactual.values, ddof=1)
    if std > 0:
        return std
    
    sigma_test = np.arange(500, 10, -10) #Just test a bunch of values
    idx = np.searchsorted(bins, mean)
    idx = np.argmin(abs(np.array(bins) - mean))
    x1 = bins[idx-2] if idx > 2 else -1
    x2 = bins[idx-1]
    x3 = bins[idx]
    x4 = bins[idx+1] if idx < len(bins)-2 else np.inf
    N = len(good)
    probs = [get_probability(x1, x2, x3, x4, N, mean, s) for s in sigma_test]
    for s, p in zip(sigma_test, probs):
        if p > 0.5:
            return s
    
    # If we get here, just return a guess value
    return 200.0

    #raise ValueError('No probability > 0!') 
Example 20
Project: gullikson-scripts   Author: kgullikson88   File: Sensitivity.py
def read_hdf5(hdf5_file):
    """
    Reads the hdf5 file into a dataframe. Assumes a very specific format!

    Parameters:
    ===========
    - hdf5_file:   string
                   The full path to the hdf5 file.

    Returns
    ========
    A pandas DataFrame containing summary information
    """
    logging.info('Reading HDF5 file {}'.format(hdf5_file))
    hdf5_int = HDF5_Interface(hdf5_file)
    df = hdf5_int.to_df()


    # Get the contrast. Split by group and then merge to limit the amount of calculation needed
    logging.info('Estimating the V-band contrast ratio for each trial')
    test_vsini = df.vsini.unique()[0]
    temp = df.loc[(df.rv == 0) & (df.vsini == test_vsini)].drop_duplicates(subset=['star', 'temperature'])
    temp['contrast'] = temp.apply(lambda r: get_contrast(r, band='V'), axis=1)

    logging.info('Estimating the luminosity ratio for each trial')
    temp['lum_ratio'] = temp.apply(get_luminosity_ratio, axis=1)

    logging.info('Re-merging dataframe')
    df = pd.merge(df, temp[['star', 'temperature', 'contrast', 'lum_ratio']], on=['star', 'temperature'], how='left')
    df['logL'] = np.log10(df.lum_ratio)

    return df 
Example 21
Project: gullikson-scripts   Author: kgullikson88   File: Sensitivity.py
def parse_input(inp, sort_output=True, ensure_unique=True):
    """
    Parse the user input to get a list of integers.

    Parameters:
    ===========
    - inp:           string
                     Can be in the form 'a-b', 'a,b,c', 'a-b,c-d', etc.
                     '-' means an inclusive list of every number between a and b
                     ',' means the numbers a and b

    - sort_output:   boolean
                     Sort the output integers?

    - ensure_unique: boolean
                     Make sure the final list has no repeats?
    :return: A list of integers
    """
    sublists = inp.split(',')
    final_list = []
    for l in sublists:
        if '-' in l:
            first, last = l.split('-')
            for i in range(int(first), int(last) + 1):
                final_list.append(i)
        else:
            final_list.append(int(l))

    if ensure_unique:
        final_list = pd.unique(final_list)
    if sort_output:
        final_list = sorted(final_list)
    return final_list 
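
A quick worked example of parse_input above: the '-' range is expanded, duplicates are dropped via pd.unique, and the result is sorted.

selected = parse_input('1-3,7,7')
# '1-3' expands to 1, 2, 3; the duplicate 7 is dropped; result is [1, 2, 3, 7]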
Example 22
Project: gullikson-scripts   Author: kgullikson88   File: Analyze_CCF.py
def get_ccf(self, params, df=None):
        """
        Get the ccf with the given parameters.

        Parameters:
        ===========
        - params:    dictionary:
                     All the parameters necessary to define a single ccf. This should be
                     a python dictionary with the keys:
                         - 'starname': The name of the star. Try self.list_stars() for the options.
                         - 'date': The UT date of the observations. Try self.list_dates() for the options.
                         - 'T': temperature of the model
                         - 'logg': the log(g) of the model
                         - 'vsini': the vsini by which the model was broadened before correlation
                         - '[Fe/H]': the metallicity of the model
                         - 'addmode': The way the order CCFs were added to make a total one. Can be:
                             - 'simple'
                             - 'ml'
                             - 'weighted'
                             - 'dc'


        - df:        a pandas DataFrame such as outputted by _compile_data

        Returns:
        ========
        -ccf:        pandas DataFrame
                     Holds columns of velocity and CCF power
        """
        if df is None:
            try:
                df = self._compile_data(params['starname'], params['date'])
            except KeyError:
                raise KeyError('Must give get_ccf params with starname and date keywords, if df is not given!')

        Tvals = df['T'].unique()
        T = Tvals[np.argmin(abs(Tvals - params['T']))]
        good = df.loc[(df['T'] == T) & (df.logg == params['logg']) & (df.vsini == params['vsini']) \
                      & (df['[Fe/H]'] == params['[Fe/H]']) & (df.addmode == params['addmode'])]

        return pd.DataFrame(data={'velocity': self.velocities, 'CCF': good['ccf'].item()}) 
Example 23
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray) 
Example 24
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray) 
Example 25
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_object_refcount_bug(self):
        lst = ['A', 'B', 'C', 'D', 'E']
        for i in range(1000):
            len(algos.unique(lst)) 
Example 26
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_on_index_object(self):

        mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
            np.arange(5), 5)])
        expected = mindex.values
        expected.sort()

        mindex = mindex.repeat(2)

        result = pd.unique(mindex)
        result.sort()

        tm.assert_almost_equal(result, expected) 
Example 27
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda   Author: SignalMedia   File: test_algos.py
def test_unique_label_indices():
    from pandas.hashtable import unique_label_indices

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right)

    a[np.random.choice(len(a), 10)] = -1
    left = unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right) 
Example 28
Project: plydata   Author: has2k1   File: types.py
def __init__(self, data=None, groups=None, **kwargs):
        super().__init__(data=data, **kwargs)
        if groups is not None:
            self.plydata_groups = list(pd.unique(groups)) 
Example 29
Project: plydata   Author: has2k1   File: one_table.py
def _n_distinct(arr):
    """
    Number of unique values in array
    """
    return len(pd.unique(arr)) 
Example 30
Project: serenata-toolbox   Author: datasciencebr   File: test_chamber_of_deputies_dataset.py
def test_clean_2017_reimbursements(self):
        copy(os.path.join(self.fixtures_path, 'reimbursements-2017.xz'), self.path)
        file_path = os.path.join(self.path, 'reimbursements.xz')

        self.subject.clean()

        assert(os.path.exists(file_path))

        dataset = pd.read_csv(file_path, compression='xz')
        all_subquotas = [subquota[1] for subquota in self.subject.subquotas]

        present_subquotas = pd.unique(dataset['subquota_description'])
        for subquota in present_subquotas:
            with self.subTest():
                assert(subquota in all_subquotas) 
Example 31
Project: KaggleExeter   Author: detomo   File: build_features.py
def app_activity_features():
	train = pd.read_csv("gender_age_train.csv")
	test = pd.read_csv("gender_age_test.csv")
	train.drop(['gender','age','group'],axis=1,inplace=True)
	data = train.append(test)

	""" Merge with brand_model table"""
	device_table = pd.read_csv("phone_brand_device_model.csv")
	data = pd.merge(data,device_table,how='left',on='device_id')
	data = data.drop_duplicates()  #drop duplicates  #note: there is still one device associated with 2 brands/models
	del device_table
	print "data build"
	"""
	Create a dataframe indicating, for each device id, which apps are present and how active they are
		- merge events and app_events on event_id
		- group by device_id and app_id, and take the mean of activity
	"""
	events = pd.read_csv("events.csv")
	events = events[events['device_id'].isin(list(data['device_id']))]
	apps = pd.read_csv("app_events.csv")
	apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id')
	apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean()
	del events
	print "events build"
	"""Reshape the dataframe so that each app is a new feature"""
	reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id'])))
	reshaped[list(pd.unique(apps['app_id']))]=0

	for app in list(pd.unique(apps['app_id'])):
		sliced = apps[apps['app_id']==app]
		reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values
	del apps
	return reshaped



########################################################################################################################################
######################################################################################################################################## 
Example 32
Project: KaggleExeter   Author: detomo   File: test_submission.py
def app_activity_features():
	train = pd.read_csv("gender_age_train.csv")
	test = pd.read_csv("gender_age_test.csv")
	train.drop(['gender','age','group'],axis=1,inplace=True)
	data = train.append(test)

	""" Merge with brand_model table"""
	device_table = pd.read_csv("phone_brand_device_model.csv")
	data = pd.merge(data,device_table,how='left',on='device_id')
	data = data.drop_duplicates()  #drop duplicates  #note: there is still one device associated with 2 brands/models
	del device_table
	print "data build"
	"""
	Create a dataframe indicating, for each device id, which apps are present and how active they are
		- merge events and app_events on event_id
		- group by device_id and app_id, and take the mean of activity
	"""
	events = pd.read_csv("events.csv")
	events = events[events['device_id'].isin(list(data['device_id']))]
	apps = pd.read_csv("app_events.csv")
	apps = pd.merge(apps[['event_id','app_id','is_active']],events[['event_id','device_id']],on='event_id')
	apps = apps.groupby(['device_id','app_id'],as_index=False)['is_active'].mean()
	del events
	print "events build"
	"""Reshape the dataframe so that each app is a new feature"""
	reshaped = pd.DataFrame(columns=list(pd.unique(apps['app_id'])),index=list(pd.unique(apps['device_id'])))
	reshaped[list(pd.unique(apps['app_id']))]=0

	for app in list(pd.unique(apps['app_id'])):
		sliced = apps[apps['app_id']==app]
		reshaped[app].loc[list(sliced['device_id'])]=sliced['is_active'].values
	del apps
	return reshaped 
Example 33
Project: sample-cnn   Author: tae-jun   File: build_mtt.py
def _process_dataset(anno, sample_rate, n_samples, n_threads):
  """Processes, and saves MagnaTagATune dataset using multi-processes.

  Args:
    anno: Annotation DataFrame contains tags, mp3_path, split, and shard.
    sample_rate: Sampling rate of the audios. If it differs from an audio's
      original sampling rate, the audio is re-sampled.
    n_samples: Number of samples one segment contains.
    n_threads: Number of threads to process the dataset.
  """
  args_queue = Queue()
  split_and_shard_sets = pd.unique([tuple(x) for x in anno[['split', 'shard']].values])

  for split, shard in split_and_shard_sets:
    assigned_anno = anno[(anno['split'] == split) & (anno['shard'] == shard)]
    n_shards = anno[anno['split'] == split]['shard'].nunique()

    args = (assigned_anno, sample_rate, n_samples, split, shard, n_shards)
    args_queue.put(args)

  if FLAGS.n_threads > 1:
    threads = []
    for _ in range(FLAGS.n_threads):
      thread = Thread(target=_process_audio_files, args=[args_queue])
      thread.start()
      threads.append(thread)

    for thread in threads:
      thread.join()
  else:
    _process_audio_files(args_queue) 
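
Example 33 deduplicates (split, shard) pairs by passing a list of tuples to pd.unique. The same pairs can also be obtained with drop_duplicates; the one-liner below assumes the same anno columns.

# equivalent to pd.unique over the (split, shard) tuples, preserving first-appearance order
split_and_shard_sets = list(anno[['split', 'shard']].drop_duplicates().itertuples(index=False, name=None))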
Example 34
Project: calvin   Author: ucd-cws   File: postprocessor.py
def aggregate_regions(fp):

  # aggregate regions and supply portfolios
  # easier to do this with pandas by just reading the CSVs again
  sc = pd.read_csv(fp + '/shortage_cost.csv', index_col=0, parse_dates=True)
  sv = pd.read_csv(fp + '/shortage_volume.csv', index_col=0, parse_dates=True)
  flow = pd.read_csv(fp + '/flow.csv', index_col=0, parse_dates=True)
  demand_nodes = pd.read_csv('calvin/data/demand_nodes.csv', index_col = 0)
  portfolio = pd.read_csv('calvin/data/portfolio.csv', index_col = 0)

  for R in demand_nodes.region.unique():
    for t in demand_nodes.type.unique():
      ix = demand_nodes.index[(demand_nodes.region == R) & 
                              (demand_nodes.type == t)]
      sc['%s_%s' % (R,t)] = sc[ix].sum(axis=1)
      sv['%s_%s' % (R,t)] = sv[ix].sum(axis=1)

  for P in portfolio.region.unique():
    for k in portfolio.supplytype.unique():
      for t in portfolio.type.unique():
        ix = portfolio.index[(portfolio.region == P) & 
                             (portfolio.type ==t) & 
                             (portfolio.supplytype == k)]
        flow['%s_%s_%s' % (P,k,t)] = flow[ix].sum(axis=1)

  sc.to_csv(fp + '/shortage_cost.csv')
  sv.to_csv(fp + '/shortage_volume.csv')
  flow.to_csv(fp + '/flow.csv') 
Example 35
Project: calvin   Author: ucd-cws   File: calvin.py
def remove_debug_links(self):
    df = self.df
    ix = df.index[df.index.str.contains('DBUG')]
    df.drop(ix, inplace=True, axis=0)
    self.nodes = pd.unique(df[['i','j']].values.ravel()).tolist()
    self.links = list(zip(df.i,df.j,df.k))
    return df 
Example 36
Project: finch   Author: chrisranderson   File: data_prep.py
def nominal_to_numeric(array):
  mapper = {name: i for i, name in enumerate(pd.unique(array))}
  return np.array([mapper[name] for name in array]) 
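
A small usage sketch of nominal_to_numeric above; the integer codes follow the order in which each category first appears.

import numpy as np

codes = nominal_to_numeric(np.array(['cat', 'dog', 'cat', 'bird']))
print(codes)   # [0 1 0 2]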
Example 37
Project: echonet   Author: karoldvl   File: esc_original.py
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.bands = 60
        self.segment_length = 101

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']]
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()
        self._populate(self.validation_data)
        self._populate(self.test_data) 
Example 38
Project: skp_edu_docker   Author: TensorMSA   File: data_node_frame.py
def dataconf_eval_time_check(self, _wf_data_conf_node, _node_name):
        """
        Check whether the given node is an evaluation-data node.
        :param _node_name: node name (e.g. nn00001_1_dataconf_node)
        :return: True if the node name contains 'evaldata', otherwise False
        """
        _value = False
        if ('evaldata' in _node_name):
             _value = True
        return _value 
Example 39
Project: skp_edu_docker   Author: TensorMSA   File: data_node_frame.py
def set_dataconf_for_labels(self, df, label):
        """
        Extract the distinct label values from the given label column of the CSV-derived DataFrame.
        :param df: dataframe
        :param label: name of the label column
        """
        # TODO: set_default_dataconf_from_csv
        label_values = pd.unique(df[label].values.ravel().astype('str')).tolist()
        return label_values 
Example 40
Project: stream2segment   Author: rizac   File: test_u_download.py
def test_get_events(self, mock_query):
        urlread_sideeffect = ["""1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
"""]
        
        
        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert only the first two events were successfully saved
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        # AND data to save has length 2:
        assert len(data) == 2

        # now download again, with an url error:        
        urlread_sideeffect = [413, """1|2|3|4|5|6|7|8|9|10|11|12|13
20160508_0000129|2016-05-08 05:17:11.500000|40.57|52.23|60.0|AZER|EMSC-RTS|AZER|505483|ml|3.1|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|44.96|15.35|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|3.6|EMSC|CROATIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
20160508_0000113|2016-05-08 22:37:20.100000|45.68|26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
--- ERRROR --- THIS IS MALFORMED 20160508_abc0113|2016-05-08 22:37:20.100000| --- ERROR --- |26.64|163.0|BUC|EMSC-RTS|BUC|505351|ml|3.4|BUC|ROMANIA
""", URLError('blabla23___')]
        
        data = self.get_events_df(urlread_sideeffect, self.session, "http://eventws", db_bufsize=self.db_buf_size)
        # assert we got the same result as above:
        assert len(self.session.query(Event).all()) == len(pd.unique(data['id'])) == 2
        assert len(data) == 2
        # and since the first response is 413, the request is split into two, and the
        # second response is our URLError (we could test this more thoroughly, anyway):
        assert "blabla23___" in self.log_msg() 
Example 41
Project: seniority_list   Author: rubydatasystems   File: list_builder.py
def sort_eg_attributes(df, attributes=['doh', 'ldate'],
                       reverse_list=[0, 0],
                       add_columns=False):
    '''Sort master list attribute columns by employee group in preparation
    for list construction.  The overall master list structure and order is
    unaffected, only the selected attribute columns are sorted (normally
    date-related columns such as doh or ldate)

    inputs
        df
            The master data dataframe (does not need to be sorted)
        attributes
            columns to sort by eg (inplace)
        reverse_list
            If an attribute is to be sorted in reverse order (descending),
            use a '1' in the list position corresponding to the position of
            the attribute within the attributes input
        add_columns
            If True, an additional column for each sorted attribute will be
            added to the resultant dataframe, with the suffix '_sort' added
            to it.
    '''
    date_cols = []
    for col in df:
        if (df[col]).dtype == 'datetime64[ns]':
            date_cols.append(col)
    try:
        df.sort_values(['eg', 'eg_number'], inplace=True)
    except LookupError:
        df.sort_values(['eg', 'eg_order'], inplace=True)

    egs = df.eg.values
    i = 0
    for measure in attributes:
        data = df[measure].values
        measure_col = np.empty_like(data)
        for eg in pd.unique(df.eg):
            measure_slice = data[egs == eg]
            measure_slice_index = np.where(egs == eg)[0]
            measure_slice_sorted = np.sort(measure_slice, axis=0)

            if reverse_list[i]:
                measure_slice_invert = measure_slice_sorted[::-1]
                measure_slice_sorted = measure_slice_invert
            np.put(measure_col, measure_slice_index, measure_slice_sorted)

        if add_columns:
            col_name = measure + '_sort'
        else:
            col_name = measure

        df[col_name] = measure_col

        if measure in date_cols:
            df[col_name] = pd.to_datetime(df[col_name].dt.date)
        i += 1

    return df 
Example 42
Project: plydata   Author: has2k1   File: utils.py
def unique(lst):
    """
    Return unique elements

    :func:`pandas.unique` and :func:`numpy.unique` cast
    mixed-type lists to a common type. They are faster, but
    sometimes we want to maintain the type.

    Parameters
    ----------
    lst : list-like
        List of items

    Returns
    -------
    out : list
        Unique items in the order that they appear in the
        input.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> lst = ['one', 'two', 123, 'three']
    >>> pd.unique(lst)
    array(['one', 'two', '123', 'three'], dtype=object)
    >>> np.unique(lst)
    array(['123', 'one', 'three', 'two'],
          dtype='<U5')
    >>> unique(lst)
    ['one', 'two', 123, 'three']

    pandas and numpy cast 123 to a string, and numpy does not
    even maintain the order.
    """
    seen = set()

    def make_seen(x):
        seen.add(x)
        return x

    return [make_seen(x) for x in lst if x not in seen] 
Example 43
Project: serenata-toolbox   Author: datasciencebr   File: test_chamber_of_deputies_dataset.py
def test_fetch_translate_clean_integration(self):
        self.subject.fetch()
        files = ["Ano-{}.csv".format(n) for n in [2017]]
        files.append('datasets-format.html')

        for name in files:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.translate()
        for name in ["reimbursements-{}.xz".format(n) for n in self.years]:
            file_path = os.path.join(self.path, name)
            assert(os.path.exists(file_path))

        self.subject.clean()
        file_path = os.path.join(self.path, 'reimbursements.xz')
        assert(os.path.exists(file_path))

        # test for subquota translation
        dataset = pd.read_csv(file_path, compression='xz')
        all_subquotas = ['Maintenance of office supporting parliamentary activity',
                     'Locomotion, meal and lodging',
                     'Fuels and lubricants',
                     'Consultancy, research and technical work',
                     'Publicity of parliamentary activity',
                     'Purchase of office supplies',
                     'Software purchase or renting; Postal services; Subscriptions',
                     'Security service provided by specialized company',
                     'Flight tickets',
                     'Telecommunication',
                     'Postal services',
                     'Publication subscriptions',
                     'Congressperson meal',
                     'Lodging, except for congressperson from Distrito Federal',
                     'Automotive vehicle renting or watercraft charter',
                     'Aircraft renting or charter of aircraft',
                     'Automotive vehicle renting or charter',
                     'Watercraft renting or charter',
                     'Taxi, toll and parking',
                     'Terrestrial, maritime and fluvial tickets',
                     'Participation in course, talk or similar event',
                     'Flight ticket issue']

        present_subquotas = pd.unique(dataset['subquota_description'])
        for subquota in present_subquotas:
            assert(subquota in all_subquotas) 
Example 44
Project: echonet   Author: karoldvl   File: esc.py
def __init__(self, data_dir, work_dir, train_folds, validation_folds, test_folds, esc10=False,
                 downsample=True):
        super().__init__(data_dir, work_dir)

        self.meta = pd.read_csv(data_dir + 'esc50.csv')

        self.train_folds = train_folds
        self.validation_folds = validation_folds
        self.test_folds = test_folds

        self.class_count = 50

        self.DOWNSAMPLE = downsample
        self.SEGMENT_LENGTH = 300
        self.BANDS = 180
        self.WITH_DELTA = False
        self.FMAX = 16000
        self.FFT = 2205
        self.HOP = 441

        self.esc10 = esc10
        if self.esc10:
            self.class_count = 10
            self.meta = self.meta[self.meta['esc10']]
            self.categories = pd.unique(self.meta.sort_values('target')['category'])
            self.meta['target'] = self.to_targets(self.meta['category'])
        else:
            self.categories = pd.unique(self.meta.sort_values('target')['category'])

        self.train_meta = self.meta[self.meta['fold'].isin(self.train_folds)]
        self.validation_data.meta = self.meta[self.meta['fold'].isin(self.validation_folds)]
        self.test_data.meta = self.meta[self.meta['fold'].isin(self.test_folds)]

        self._validation_size = len(self.validation_data.meta)
        self._test_size = len(self.test_data.meta)

        self._generate_spectrograms()

        if self.DOWNSAMPLE:
            self.SEGMENT_LENGTH //= 2
            self.BANDS //= 3

        self._populate(self.validation_data)
        self._populate(self.test_data) 
Example 45
Project: skp_edu_docker   Author: TensorMSA   File: data_node_frame.py
def set_dataconf_for_checktype(self, df, node_id, data_dfconf_list):
        """
        csv? ?? column type? ???? data_conf? ??(data_conf? ????? )
        ???? ??? Unique ? ?? ??? cell_feature_unique? ???(Keras?)
        
        :param wf_data_config, df, nnid, ver, node:
        :param conf_data:
        """
        try:
            # TODO: set_default_dataconf_from_csv
            data_conf = dict()
            data_conf_unique_v = dict()
            data_conf_col_unique_v = dict()
            data_conf_col_type = dict()
            numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
            # Reuse any unique values already stored in the existing data_dfconf so they can be merged with new ones
            _cell_feature_unique = list()  # default when no previous conf exists (avoids a NameError below)
            if len(data_dfconf_list) > 0:
                _wf_data_conf = wf_data_conf(data_dfconf_list)
                _cell_feature_unique = _wf_data_conf.cell_feature_unique if hasattr(_wf_data_conf,
                                                                      'cell_feature_unique') else list()  # previously stored unique values, if any
            for i, v in df.dtypes.iteritems():
                # label
                column_dtypes = dict()
                column_unique_value = dict()
                if (str(v) in numerics):  # maybe need float
                    col_type = 'CONTINUOUS'
                    columns_unique_value = list()
                else:
                    col_type = 'CATEGORICAL'
                    columns_unique_value = pd.unique(df[i].fillna('').values.ravel()).tolist()  # fill nulls before collecting the unique values
                column_dtypes['column_type'] = col_type
                origin_feature_unique = _cell_feature_unique[i].get('column_u_values') if (i in _cell_feature_unique) else list()
                combined_col_u_list = utils.get_combine_label_list(origin_feature_unique, columns_unique_value)
                column_unique_value['column_u_values'] = combined_col_u_list    # merge previously stored unique values with the new ones
                data_conf_col_type[i] = column_dtypes
                data_conf_col_unique_v[i] = column_unique_value
            data_conf['cell_feature'] = data_conf_col_type
            data_conf_unique_v['cell_feature_unique'] = data_conf_col_unique_v
            data_conf_json_str = json.dumps(data_conf)  # serialize to JSON
            data_conf_json = json.loads(data_conf_json_str)
            data_conf_unique_json_str = json.dumps(data_conf_unique_v)
            data_conf_unique_json = json.loads(data_conf_unique_json_str)
            return data_conf_json, data_conf_unique_json
        except Exception as e:
            logging.error("set_dataconf_for_checktype {0} {1}".format(e, e.__traceback__.tb_lineno))