Python numpy.unique() Examples

The following code examples show how to use numpy.unique(). They are taken from open-source Python projects and ranked by community votes.
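
Before the project examples, a minimal self-contained sketch of the main return modes of numpy.unique(); the input array below is invented purely for illustration.

import numpy as np

a = np.array([3, 1, 2, 3, 1, 1])

# Unique values are returned sorted in ascending order.
print(np.unique(a))                       # [1 2 3]

# return_counts also reports how many times each unique value occurs.
vals, counts = np.unique(a, return_counts=True)
print(counts)                             # [3 1 2]

# return_index gives the position of the first occurrence of each unique value.
vals, first_idx = np.unique(a, return_index=True)
print(first_idx)                          # [1 2 0]

# return_inverse maps every element of `a` to an index into `vals`,
# so vals[inverse] reconstructs the original array.
vals, inverse = np.unique(a, return_inverse=True)
print(np.array_equal(vals[inverse], a))   # True

Most of the examples below use one of these four forms.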

Example 1
Project: building-boundary   Author: Geodan   File: bounding_box.py    MIT License 6 votes
def compute_edge_angles(edges):
    """
    Compute the angles between the edges and the x-axis.

    Parameters
    ----------
    edges : (Mx2x2) array
        The coordinates of the sets of points that define the edges.

    Returns
    -------
    edge_angles : (Mx1) array
        The angles between the edges and the x-axis.
    """
    edges_count = len(edges)
    edge_angles = np.zeros(edges_count)
    for i in range(edges_count):
        edge_x = edges[i][1][0] - edges[i][0][0]
        edge_y = edges[i][1][1] - edges[i][0][1]
        edge_angles[i] = math.atan2(edge_y, edge_x)

    return np.unique(edge_angles) 
Example 2
Project: models   Author: kipoi   File: dataloader_m.py    MIT License 6 votes
def prepro_pos_table(pos_tables):
    """Extracts unique positions and sorts them."""
    if not isinstance(pos_tables, list):
        pos_tables = [pos_tables]

    pos_table = None
    for next_pos_table in pos_tables:
        if pos_table is None:
            pos_table = next_pos_table
        else:
            pos_table = pd.concat([pos_table, next_pos_table])
        pos_table = pos_table.groupby('chromo').apply(
            lambda df: pd.DataFrame({'pos': np.unique(df['pos'])}))
        pos_table.reset_index(inplace=True)
        pos_table = pos_table[['chromo', 'pos']]
        pos_table.sort_values(['chromo', 'pos'], inplace=True)
    return pos_table 
Example 3
Project: skylab   Author: coenders   File: ps_injector.py    GNU General Public License v3.0 6 votes
def __init__(self, logE, logFlux, gamma, deg=4, **kwargs):
        # Make sure that energy bins are of increasing order.
        sorter = np.argsort(logE)
        energy = logE[sorter]
        flux = logFlux[sorter]

        # Make sure that energy bins contain only unique values.
        unique = np.argwhere(np.diff(energy) > 0.)
        energy = energy[unique]
        flux = flux[unique]

        self._spline = scipy.interpolate.InterpolatedUnivariateSpline(
            energy, flux, k=deg)

        # Use default energy range of flux parametrization.
        kwargs.setdefault("e_range", [10.**np.amin(logE), 10.**np.amax(logE)])

        super(ModelInjector, self).__init__(gamma, **kwargs) 
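
A side note on Example 3: the np.argsort plus np.diff filter above keeps only strictly increasing energy bins (and, as written, always drops the largest bin). For the common case of simply sorting and deduplicating paired arrays, numpy.unique does both in one call; the logE/logFlux values below are made up for illustration, and this is a sketch rather than a drop-in replacement for the original filter.

import numpy as np

logE    = np.array([2.0, 1.0, 1.0, 3.0])   # invented, unsorted, with a duplicate
logFlux = np.array([0.2, 0.1, 0.1, 0.3])

# np.unique sorts and deduplicates; return_index gives the positions of the
# kept elements so the matching flux values can be selected as well.
energy, keep = np.unique(logE, return_index=True)
flux = logFlux[keep]
print(energy)   # [1. 2. 3.]
print(flux)     # [0.1 0.2 0.3]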
Example 4
Project: fuku-ml   Author: fukuball   File: RidgeRegression.py    MIT License 6 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        return self.W 
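
Examples 4 to 7 (and the PLA/PocketPLA variants further down) all build one-vs-one class pairs the same way; a tiny standalone sketch with invented labels shows what np.unique contributes here.

import itertools
import numpy as np

train_Y = np.array([0, 2, 1, 2, 0, 1, 1])   # invented labels

# np.unique yields the sorted distinct class labels; combinations(..., 2)
# then enumerates every unordered pair of classes for one-vs-one training.
class_list = list(itertools.combinations(np.unique(train_Y), 2))
print(class_list)   # three pairs: (0, 1), (0, 2), (1, 2)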
Example 5
Project: fuku-ml   Author: fukuball   File: KernelRidgeRegression.py    MIT License 6 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        return self.W 
Example 6
Project: fuku-ml   Author: fukuball   File: SupportVectorMachine.py    MIT License 6 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        return self.W 
Example 7
Project: fuku-ml   Author: fukuball   File: LinearRegression.py    MIT License 6 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        return self.W 
Example 8
Project: advent-of-code-2018   Author: badouralix   File: youyoun.py    MIT License 6 votes
def run(self, s):
        points = [[int(e.strip()) for e in x.split(",")] for x in s.splitlines()]
        w, h = max((p[0] for p in points)), max((p[1] for p in points))
        plan = np.zeros((h + 1, w + 1))
        for i, e in enumerate(points):
            plan[e[1], e[0]] = i + 1
        borders_points = {-1}
        for i in range(plan.shape[0]):
            for j in range(plan.shape[1]):
                distances = {}
                for k, e in enumerate(points):
                    dist = abs(i - e[0]) + abs(j - e[1])
                    distances[k] = dist
                min_dist = min(distances.values())
                close_points = {point for point, dist in distances.items() if dist == min_dist}
                if len(close_points) >= 2:
                    plan[i, j] = -1
                else:
                    point = close_points.pop()
                    plan[i, j] = point + 1
                    if i == 0 or i == plan.shape[1] or j == 0 or j == plan.shape[0]:
                        borders_points.add(point + 1)
        unique, counts = np.unique(plan, return_counts=True)
        point_counts = dict(zip(unique, counts))
        return max([c for p, c in point_counts.items() if p not in borders_points]) 
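
np.unique(plan, return_counts=True) flattens the 2-D grid, so the counts are simply the number of cells carrying each label; a minimal sketch with an invented 3x3 grid.

import numpy as np

plan = np.array([[ 1.,  1., 2.],
                 [ 1.,  2., 2.],
                 [-1.,  2., 2.]])   # invented; -1 marks contested cells

# return_counts works on the flattened array, i.e. it counts cells per label.
unique, counts = np.unique(plan, return_counts=True)
print(unique)   # [-1.  1.  2.]
print(counts)   # [1 3 5]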
Example 9
Project: cvpr2018-hnd   Author: kibok90   File: test.py    MIT License 6 votes
def count_super(p, m, counters, preds, labels, label_to_ch):
    
    for l in np.unique(labels):
        preds_l = preds[labels == l]
        
        # in -> known
        if label_to_ch[l]:
            acc = np.zeros_like(preds_l, dtype=bool)
            for c in label_to_ch[l]:
                if p == 0: counters['data'][m][c] += preds_l.shape[0]
                acc |= (preds_l == c)
            acc_sum = acc.sum()
            for c in label_to_ch[l]:
                counters['acc'][p,m][c] += acc_sum
        
        # out -> novel
        else:
            if p == 0: counters['data'][m][-1] += preds_l.shape[0]
            acc_sum = (preds_l < 0).sum()
            counters['acc'][p,m][-1] += acc_sum 
Example 10
Project: pidforest   Author: vatsalsharan   File: subcube.py    MIT License 6 votes
def filter_pts(self, pts, indices):
        pts_ind = []
        list_ind = []
        self.dim, num_pts = np.shape(pts)
        for i in range(num_pts):
            in_cube = True
            for j in range(self.dim):
                if (pts[j][i] < self.start[j]) or (pts[j][i] >= self.end[j]):
                    in_cube = False
            if in_cube:
                pts_ind.append(i)
                list_ind.append(indices[i])
        self.points = pts[:, pts_ind]
        self.indices = list_ind
        self.num_pts = len(list_ind)
        self.val = {}
        self.count = {}
        for axis in range(self.dim):
            self.val[axis], self.count[axis] = np.unique(np.array(self.points[axis]), return_counts=True) 
Example 11
Project: pidforest   Author: vatsalsharan   File: forest.py    MIT License 6 votes
def fit(self, pts):
        self.points = pts
        self.dim, self.size = np.shape(self.points)
        if int(self.sample_axis*self.dim) == 0:
            print("sample_axis is too low")
            return
        self.start = np.zeros(self.dim)
        self.end = np.zeros(self.dim)
        for axis in range(self.dim):
            val = np.unique(np.array(self.points[axis]))
            if len(val) <= 1:
                print("No entropy in dimension :", axis)
                return
            self.start[axis] = (3 * val[0] - val[1]) / 2
            self.end[axis] = (3 * val[-1] - val[-2]) / 2
        k_args = {'depth': 0, 'forest': self}
        max_sample_size = np.min((self.size, self.max_depth*200))
        sample = np.random.choice(self.size, max_sample_size, replace=False)
        for i in range(self.n_trees):
            k_args['indices'] = np.random.choice(self.size, self.max_samples, replace=False)
            root_node = Node(**k_args)
            root_node.compute_density(sample)
            self.tree.append(root_node)
            self.n_leaves[i] = root_node.compute_leaf_num() 
Example 12
Project: pidforest   Author: vatsalsharan   File: forest.py    MIT License 6 votes
def __init__(self, node, indices):
        self.node = node
        self.indices = indices
        self.val = []
        self.count = []
        self.gap = []
        for axis in range(self.node.forest.dim):
            val, count = np.unique(np.array(self.node.forest.points[axis, self.indices]), return_counts=True)
            self.val.append(val)
            self.count.append(count)
            if len(val) <= 1:
                gap = [0]
            else:
                gap = np.zeros(len(val))
                gap[0] = (val[0] + val[1]) / 2 - self.node.cube.start[axis]
                gap[-1] = self.node.cube.end[axis] - (val[-1] + val[-2]) / 2
                for i in range(1, len(val) - 1):
                    gap[i] = (val[i + 1] - val[i - 1]) / 2
            self.gap.append(gap) 
Example 13
Project: pidforest   Author: vatsalsharan   File: new_forest.py    MIT License 6 votes
def fit(self, pts):
        self.points = pts
        self.dim, self.size = np.shape(self.points)
        if int(self.sample_axis * self.dim) == 0:
            print("sample_axis is too low")
            return
        self.start = np.zeros((self.dim, 1))
        self.end = np.zeros((self.dim, 1))
        for axis in range(self.dim):
            val = np.unique(np.array(self.points[axis]))
            if len(val) <= 1:
                print("No entropy in dimension :", axis)
                return
            self.start[axis] = (3 * val[0] - val[1]) / 2
            self.end[axis] = (3 * val[-1] - val[-2]) / 2
        k_args = {'depth': 0, 'forest': self}
        sample = np.random.choice(self.size, self.max_depth * 50, replace=False)
        for i in range(self.n_trees):
            k_args['indices'] = np.random.choice(self.size, self.max_samples, replace=False)
            root_node = Node(**k_args)
            root_node.compute_density(sample)
            self.tree.append(root_node) 
Example 14
Project: pidforest   Author: vatsalsharan   File: myforest.py    MIT License 6 votes
def fit(self, pts):
        self.points = pts
        self.dim, self.size = np.shape(self.points)
        if int(self.sample_axis*self.dim) == 0:
            print("sample_axis is too low")
            return
        self.start = np.zeros(self.dim)
        self.end = np.zeros(self.dim)
        for axis in range(self.dim):
            val = np.unique(np.array(self.points[axis]))
            if len(val) <= 1:
                print("No entropy in dimension :", axis)
                return
            self.start[axis] = (3 * val[0] - val[1]) / 2
            self.end[axis] = (3 * val[-1] - val[-2]) / 2
        k_args = {'depth': 0, 'forest': self}
        max_sample_size = np.min((self.size, self.max_depth*50))
        sample = np.random.choice(self.size, max_sample_size, replace=False)
        for i in range(self.n_trees):
            k_args['indices'] = np.random.choice(self.size, self.max_samples, replace=False)
            root_node = Node(**k_args)
            root_node.compute_density(self.points[:, sample])
            self.tree.append(root_node) 
Example 15
Project: pidforest   Author: vatsalsharan   File: myforest.py    MIT License 6 votes
def __init__(self, node, indices):
        self.node = node
        self.indices = indices
        self.val = []
        self.count = []
        self.gap = []
        for axis in range(self.node.forest.dim):
            val, count = np.unique(np.array(self.node.forest.points[axis, self.indices]), return_counts=True)
            self.val.append(val)
            self.count.append(count)
            if len(val) <= 1:
                gap = [0]
            else:
                gap = np.zeros(len(val))
                gap[0] = (val[0] + val[1]) / 2 - self.node.cube.start[axis]
                gap[-1] = self.node.cube.end[axis] - (val[-1] + val[-2]) / 2
                for i in range(1, len(val) - 1):
                    gap[i] = (val[i + 1] - val[i - 1]) / 2
            self.gap.append(gap) 
Example 16
Project: sumo   Author: ratan-lab   File: test_utils.py    MIT License 6 votes
def test_extract_ncut():
    samples = 10
    a = np.random.random((samples, samples))
    a = (a * a.T) / 2

    labels = utils.extract_ncut(a, 2)
    assert labels.size == samples
    assert np.unique(labels).size == 2

    labels = utils.extract_ncut(a, 4)
    assert labels.size == samples
    assert np.unique(labels).size == 4

    with pytest.raises(AssertionError):
        a[0, 1], a[1, 0] = 2, 1
        utils.extract_ncut(a, 4) 
Example 17
Project: transferlearning   Author: jindongwang   File: MEDA.py    MIT License 6 votes
def estimate_mu(self, _X1, _Y1, _X2, _Y2):
        adist_m = proxy_a_distance(_X1, _X2)
        C = len(np.unique(_Y1))
        epsilon = 1e-3
        list_adist_c = []
        for i in range(1, C + 1):
            ind_i, ind_j = np.where(_Y1 == i), np.where(_Y2 == i)
            Xsi = _X1[ind_i[0], :]
            Xtj = _X2[ind_j[0], :]
            adist_i = proxy_a_distance(Xsi, Xtj)
            list_adist_c.append(adist_i)
        adist_c = sum(list_adist_c) / C
        mu = adist_c / (adist_c + adist_m)
        if mu > 1:
            mu = 1
        if mu < epsilon:
            mu = 0
        return mu 
Example 18
Project: transferlearning   Author: jindongwang   File: BDA.py    MIT License 6 votes
def estimate_mu(_X1, _Y1, _X2, _Y2):
    adist_m = proxy_a_distance(_X1, _X2)
    C = len(np.unique(_Y1))
    epsilon = 1e-3
    list_adist_c = []
    for i in range(1, C + 1):
        ind_i, ind_j = np.where(_Y1 == i), np.where(_Y2 == i)
        Xsi = _X1[ind_i[0], :]
        Xtj = _X2[ind_j[0], :]
        adist_i = proxy_a_distance(Xsi, Xtj)
        list_adist_c.append(adist_i)
    adist_c = sum(list_adist_c) / C
    mu = adist_c / (adist_c + adist_m)
    if mu > 1:
        mu = 1
    if mu < epsilon:
        mu = 0
    return mu 
Example 19
Project: sparse-subspace-clustering-python   Author: abhinav4192   File: BestMap.py    MIT License 6 votes
def BestMap(L1, L2):

    L1 = L1.flatten(order='F').astype(float)
    L2 = L2.flatten(order='F').astype(float)
    if L1.size != L2.size:
        sys.exit('size(L1) must == size(L2)')
    Label1 = np.unique(L1)
    nClass1 = Label1.size
    Label2 = np.unique(L2)
    nClass2 = Label2.size
    nClass = max(nClass1, nClass2)

    # For Hungarian - Label2 are Workers, Label1 are Tasks.
    G = np.zeros([nClass, nClass]).astype(float)
    for i in range(0, nClass2):
        for j in range(0, nClass1):
            G[i, j] = np.sum(np.logical_and(L2 == Label2[i], L1 == Label1[j]))

    c = Hungarian(-G)
    newL2 = np.zeros(L2.shape)
    for i in range(0, nClass2):
        newL2[L2 == Label2[i]] = Label1[c[i]]
    return newL2 
Example 20
Project: rhodonite   Author: nestauk   File: misc.py    MIT License 5 votes
def get_aggregate_vp(g, vp, vp_grouper, agg=None):
    """aggregate_property_map
    
    Parameters
    ----------
        g : :obj:`graph_tool.Graph` 
            A graph.
        vp : :obj:`str`
            String representing an internal property map of graph, g.
        vp_grouper : :obj:`str` 
            String representing name of an internal property map that will be 
            used to group by.
        agg : :obj:`function` 
            Function to aggregate by. For example, min, max, sum, numpy.mean, 
            etc.
    Returns
    -------
        :obj:`iter` of :obj:`float` 
            Aggregated values from x. 
    """
    vp_vals = get_vp_values(g, vp)
    vp_agg = get_vp_values(g, vp_grouper)
    
    sid_x = vp_agg.argsort()
    # Get where the sorted version of base changes groups
    split_idx = np.flatnonzero(np.diff(vp_agg[sid_x]) > 0) + 1
    # OR np.unique(base[sidx],return_index=True)[1][1:]

    # Finally sort inp based on the sorted indices and split based on split_idx
    vp_vals_grouped = np.split(vp_vals[sid_x], split_idx)
    
    x = sorted(set(vp_agg))
    if agg: 
        y = [agg(vvg) for vvg in vp_vals_grouped]
    else:
        y = vp_vals_grouped

    return x, y 
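
The commented-out alternative in the snippet above, np.unique(base[sidx], return_index=True)[1][1:], yields the same split points as the np.diff trick. A hedged sketch of that group-by pattern with invented data:

import numpy as np

vp_agg  = np.array([2, 0, 1, 0, 2, 1])               # group key per vertex (invented)
vp_vals = np.array([10., 20., 30., 40., 50., 60.])   # values to group (invented)

sid_x = vp_agg.argsort(kind='stable')
# return_index gives the first position of each group in the sorted order;
# dropping the leading 0 leaves exactly the cut points np.split needs.
split_idx = np.unique(vp_agg[sid_x], return_index=True)[1][1:]
groups = np.split(vp_vals[sid_x], split_idx)
print(groups)   # [array([20., 40.]), array([30., 60.]), array([10., 50.])]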
Example 21
Project: Collaborative-Learning-for-Weakly-Supervised-Object-Detection   Author: Sunarker   File: ds_utils.py    MIT License 5 votes
def unique_boxes(boxes, scale=1.0):
  """Return indices of unique boxes."""
  v = np.array([1, 1e3, 1e6, 1e9])
  hashes = np.round(boxes * scale).dot(v)
  _, index = np.unique(hashes, return_index=True)
  return np.sort(index) 
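
The hash-then-unique trick above collapses each box (x1, y1, x2, y2) to one number so np.unique(return_index=True) can drop duplicate rows. On numpy 1.13+ the same deduplication is available directly with axis=0; the boxes below are invented.

import numpy as np

boxes = np.array([[0, 0, 10, 10],
                  [5, 5, 20, 20],
                  [0, 0, 10, 10]])   # last row duplicates the first

v = np.array([1, 1e3, 1e6, 1e9])
hashes = np.round(boxes * 1.0).dot(v)            # one scalar hash per box
_, index = np.unique(hashes, return_index=True)
print(np.sort(index))                            # [0 1] -> keep the first two rows

# Row-wise deduplication without hashing (requires numpy >= 1.13):
print(np.unique(boxes, axis=0))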
Example 22
Project: prediction-constrained-topic-models   Author: dtak   File: util_io_training.py    MIT License 5 votes
def calc_laps_when_snapshots_saved(
        n_batches=1,
        n_laps=1,
        return_str=False,
        n_keep_left=5,
        n_keep_right=5,
        **alg_state_kwargs):
    alg_state_kws = dict(**alg_state_kwargs)
    alg_state_kws['cur_step'] = 0
    alg_state_kws['n_batches'] = int(n_batches)
    alg_state_kws['n_laps'] = float(n_laps)
    alg_state_kws = init_alg_state_kwargs(**alg_state_kws)
    n_steps = alg_state_kws['n_steps']
    do_save_laps = list()
    do_save_steps = list()
    for step_id in range(0, n_steps + 1):
        if do_save_now(**alg_state_kws):
            do_save_laps.append(alg_state_kws['cur_lap'])
            do_save_steps.append(alg_state_kws['cur_step'])
        alg_state_kws = update_alg_state_kwargs(**alg_state_kws)
    if n_keep_left > 0 and n_keep_right > 0:
        do_save_laps = np.unique(np.hstack([
            do_save_laps[:n_keep_left],
            do_save_laps[-n_keep_right:]])).tolist()
        do_save_steps = np.unique(np.hstack([
            do_save_steps[:n_keep_left],
            do_save_steps[-n_keep_right:]])).tolist()
        if len(do_save_steps) > n_keep_left:
            do_save_steps.insert(n_keep_left, -1)
            do_save_laps.insert(n_keep_left, -1)
    if return_str:
        steps_str = ','.join(['%6d' % a for a in do_save_steps]).replace('-1,', '...')
        laps_str  = ','.join(['%6.3g' % a for a in do_save_laps]).replace('-1,', '...')
        return laps_str, steps_str
    else:
        return do_save_laps, do_save_steps 
Example 23
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License 5 votes
def make_best_task_df(
        df,
        target_query="SPLIT_NAME == 'VALID' and LAP > 50",
        score_colname='Y_ERROR_RATE',
        score_ranking_func=np.argmin,
        default_score=None,
        verbose=False):
    ''' Find best task for each unique job in provided df.

    Returns
    -------
    best_df : dataframe of best tasks for each unique job
    '''
    if default_score is None:
        default_score = fetch_default_score(score_ranking_func.__name__)
    best_task_df_list = list()
    job_paths = np.unique(df['JOB_PATH'].values)
    for job_path in job_paths:
        if job_path is None:
            continue
        job_df = df.query("JOB_PATH == '%s'" % job_path)
        taskids = np.unique(job_df['TASKID'].values)
        best_score_idx = np.zeros_like(taskids, dtype=np.int32)
        best_score = default_score * np.ones_like(taskids, dtype=np.float64)
        for tt, taskidstr in enumerate(taskids):
            task_df = job_df.query(target_query + " and TASKID == '%s'" % taskidstr)
            if task_df.shape[0] < 1:
                continue
            if not np.all(np.isfinite(task_df[score_colname].values)):
                pprint(task_df[score_colname].values)
            best_score_idx[tt] = score_ranking_func(task_df[score_colname].values)
            best_score[tt] = task_df[score_colname].values[best_score_idx[tt]]
        best_task_idx = score_ranking_func(best_score)
        best_task_df = job_df.query("TASKID == '%s'" % taskids[best_task_idx])
        best_task_df_list.append(best_task_df)
        if verbose:
            pprint(job_path)
            pprint("best task: %s" % best_task_idx)
    return pd.concat(best_task_df_list) 
Example 24
Project: FasterRCNN_TF_Py3   Author: upojzsb   File: ds_utils.py    MIT License 5 votes
def unique_boxes(boxes, scale=1.0):
    """Return indices of unique boxes."""
    v = np.array([1, 1e3, 1e6, 1e9])
    hashes = np.round(boxes * scale).dot(v)
    _, index = np.unique(hashes, return_index=True)
    return np.sort(index) 
Example 25
Project: ML_from_scratch   Author: jarfa   File: RegressionTree.py    Apache License 2.0 5 votes
def find_potential_splits(data, p=0.05):
    splits = np.percentile(data, 100 * np.arange(p, 1.0, p), axis=0)
    return dict(
        (c, np.unique(splits[:, c])) for c in range(splits.shape[1])
    ) 
Example 26
Project: models   Author: kipoi   File: gtf_utils.py    MIT License 5 votes
def get_all_exons(self):
        exons = np.vstack([i.exons for i in self.trans])
        exons = np.unique(exons, axis=0)
        ind = np.lexsort((exons[:,1],exons[:,0]))
        if self.strand == '-':
            ind = ind[::-1]
        exons = exons[ind]
        return exons 
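
np.unique(exons, axis=0) treats each (start, end) row as a single element, so exons shared by several transcripts collapse to one row; a sketch with invented coordinates.

import numpy as np

exons = np.array([[100, 200],
                  [300, 400],
                  [100, 200]])   # duplicate exon from another transcript

print(np.unique(exons, axis=0))
# [[100 200]
#  [300 400]]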
Example 27
Project: models   Author: kipoi   File: gtf_utils.py    MIT License 5 votes
def get_all_introns(self):
        for j in range(len(self.trans)):
            self.trans[j].add_introns()
        introns = np.vstack([i.introns for i in self.trans])
        introns = np.unique(introns, axis=0)
        ind = np.lexsort((introns[:,1],introns[:,0]))
        if self.strand == '-':
            ind = ind[::-1]
        introns = introns[ind]
        return introns 
Example 28
Project: models   Author: kipoi   File: gtf_utils.py    MIT License 5 votes
def get_all_exons(self):
        exons = np.vstack([i.exons for i in self.trans])
        exons = np.unique(exons, axis=0)
        ind = np.lexsort((exons[:,1],exons[:,0]))
        if self.strand == '-':
            ind = ind[::-1]
        exons = exons[ind]
        return exons 
Example 29
Project: models   Author: kipoi   File: gtf_utils.py    MIT License 5 votes
def get_all_introns(self):
        for j in range(len(self.trans)):
            self.trans[j].add_introns()
        introns = np.vstack([i.introns for i in self.trans])
        introns = np.unique(introns, axis=0)
        ind = np.lexsort((introns[:,1],introns[:,0]))
        if self.strand == '-':
            ind = ind[::-1]
        introns = introns[ind]
        return introns 
Example 30
Project: models   Author: kipoi   File: gtf_utils.py    MIT License 5 votes
def get_all_introns(self):
        for j in range(len(self.trans)):
            self.trans[j].add_introns()
        introns = np.vstack([i.introns for i in self.trans])
        introns = np.unique(introns, axis=0)
        ind = np.lexsort((introns[:,1],introns[:,0]))
        if self.strand == '-':
            ind = ind[::-1]
        introns = introns[ind]
        return introns 
Example 31
Project: NiBetaSeries   Author: HBClab   File: nistats.py    MIT License 5 votes
def _lss_events_iterator(events_file):
    """Make a model for each trial using least squares separate (LSS)

    Parameters
    ----------
    events_file : str
        File that contains all events from the bold run

    Yields
    ------
    events_trial : DataFrame
        A DataFrame in which the target trial maintains its trial type,
        but all other trials are assigned to 'other'
    trial_type : str
        The trial_type of the target trial
    trial_counter : int
        The marker for the nth trial of that type
    """

    import pandas as pd
    import numpy as np
    events = pd.read_csv(events_file, sep='\t')
    trial_counter = dict([(t, 0) for t in np.unique(events['trial_type'])])
    for trial_id in range(len(events)):
        trial_type = events.loc[trial_id, 'trial_type']
        # make a copy of the dataframe
        events_trial = events.copy()
        # assign new name to all events from original condition
        trial_type_id = events_trial['trial_type'] == trial_type
        events_trial.loc[trial_type_id, 'trial_type'] = 'other'
        # assign the trial of interest to be its original value
        events_trial.loc[trial_id, 'trial_type'] = trial_type
        yield events_trial, trial_type, trial_counter[trial_type]
        trial_counter[trial_type] += 1 
Example 32
Project: mlearn   Author: materialsvirtuallab   File: nnp.py    BSD 3-Clause "New" or "Revised" License 5 votes
def write_cfgs(self, filename, cfg_pool):
        """
        Write the formatted configuration file.

        Args:
            filename (str): The filename to be written.
            cfg_pool (list): The configuration pool contains
                structure and energy/forces properties.
        """
        lines = []
        for dataset in cfg_pool:
            if isinstance(dataset['structure'], dict):
                structure = Structure.from_dict(dataset['structure'])
            else:
                structure = dataset['structure']
            energy = dataset['outputs']['energy']
            forces = dataset['outputs']['forces']
            virial_stress = dataset['outputs']['virial_stress']

            lines.append(self._line_up(structure, energy, forces, virial_stress))

            # dist = np.unique(structure.distance_matrix.ravel())[1]
            # if self.shortest_distance > dist:
            #     self.shortest_distance = dist

        self.specie = Element(structure.symbol_set[0])

        with open(filename, 'w') as f:
            f.write('\n'.join(lines))

        return filename 
Example 33
Project: core   Author: lifemapper   File: layer_encoder.py    GNU General Public License v3.0 5 votes
def _get_largest_class_method(min_coverage, nodata):
    """Gets the function to use for determining the largest class

    Args:
        min_coverage: The minimum percentage of the data window that must be
            covered by the largest class.
        nodata: This value is assumed to be nodata in the array
    """
    if min_coverage > 1.0:
        min_coverage = min_coverage / 100.0
    # ...............................
    def get_largest_class(window):
        min_num = min_coverage * window.size
        largest_count = 0
        largest_class = nodata
        unique_values = np.column_stack(np.unique(window, return_counts=True))
        for cl, num in unique_values:
            if not np.isclose(cl, nodata) and num > largest_count \
                    and num > min_num:
                largest_class = cl
        return largest_class

    # ...............................
    def get_largest_class_1_8(window):
        """Get largest class for numpy 1.8
        """
        min_num = min_coverage * window.size
        largest_count = 0
        largest_class = nodata
        unique_values = np.unique(window)
        for cl in unique_values:
            num = np.where(window == cl)[0].size
            if not np.isclose(cl, nodata) and num > largest_count \
                    and num > min_num:
                largest_class = cl
        return largest_class
    return get_largest_class_1_8

# ............................................................................. 
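
get_largest_class above walks the (value, count) pairs from np.unique(window, return_counts=True) to find the dominant class in a window; the same idea in a compact sketch (ignoring the min_coverage threshold), with an invented window and nodata value.

import numpy as np

nodata = -9999
window = np.array([[1, 1, 2],
                   [1, 2, nodata],
                   [2, 2, 2]])

values, counts = np.unique(window, return_counts=True)
keep = values != nodata                     # ignore the nodata cells
values, counts = values[keep], counts[keep]
print(values[np.argmax(counts)])            # 2 (five cells, versus three for 1)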
Example 34
Project: fuku-ml   Author: fukuball   File: PLA.py    MIT License 5 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        if mode == 'linear_regression_accelerator':
            accelerator = linear_regression.Accelerator()
            for class_item in self.class_list:
                modify_X, modify_Y = utility.DatasetLoader.modify_XY(self.train_X, self.train_Y, class_item)
                self.temp_train_X = self.train_X
                self.temp_train_Y = self.train_Y
                self.train_X = modify_X
                self.train_Y = modify_Y
                self.temp_data_num = self.data_num
                self.data_num = len(self.train_Y)
                self.temp_W = self.W
                self.W = self.temp_W[class_item]
                self.temp_W[class_item] = accelerator.init_W(self)
                self.train_X = self.temp_train_X
                self.train_Y = self.temp_train_Y
                self.temp_train_X = []
                self.temp_train_Y = []
                self.data_num = self.temp_data_num
                self.temp_data_num = 0
                self.W = self.temp_W
                self.temp_W = {}

        return self.W 
Example 35
Project: fuku-ml   Author: fukuball   File: PocketPLA.py    MIT License 5 votes
def init_W(self, mode='normal'):

        self.W = {}

        if (self.status != 'load_train_data') and (self.status != 'train'):
            print("Please load train data first.")
            return self.W

        self.status = 'init'

        self.data_num = len(self.train_Y)
        self.data_demension = len(self.train_X[0])
        self.class_list = list(itertools.combinations(np.unique(self.train_Y), 2))

        for class_item in self.class_list:
            self.W[class_item] = np.zeros(self.data_demension)

        if mode == 'linear_regression_accelerator':
            accelerator = linear_regression.Accelerator()
            for class_item in self.class_list:
                modify_X, modify_Y = utility.DatasetLoader.modify_XY(self.train_X, self.train_Y, class_item)
                self.temp_train_X = self.train_X
                self.temp_train_Y = self.train_Y
                self.train_X = modify_X
                self.train_Y = modify_Y
                self.temp_data_num = self.data_num
                self.data_num = len(self.train_Y)
                self.temp_W = self.W
                self.W = self.temp_W[class_item]
                self.temp_W[class_item] = accelerator.init_W(self)
                self.train_X = self.temp_train_X
                self.train_Y = self.temp_train_Y
                self.temp_train_X = []
                self.temp_train_Y = []
                self.data_num = self.temp_data_num
                self.temp_data_num = 0
                self.W = self.temp_W
                self.temp_W = {}

        return self.W 
Example 36
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: metric.py    Apache License 2.0 5 votes
def update_binary_stats(self, label, pred):
        """
        Update various binary classification counts for a single (label, pred)
        pair.

        Parameters
        ----------
        label : `NDArray`
            The labels of the data.

        pred : `NDArray`
            Predicted values.
        """
        pred = pred.asnumpy()
        label = label.asnumpy().astype('int32')
        pred_label = numpy.argmax(pred, axis=1)

        check_label_shapes(label, pred)
        if len(numpy.unique(label)) > 2:
            raise ValueError("%s currently only supports binary classification."
                             % self.__class__.__name__)
        pred_true = (pred_label == 1)
        pred_false = 1 - pred_true
        label_true = (label == 1)
        label_false = 1 - label_true

        self.true_positives += (pred_true * label_true).sum()
        self.false_positives += (pred_true * label_false).sum()
        self.false_negatives += (pred_false * label_true).sum()
        self.true_negatives += (pred_false * label_false).sum() 
Example 37
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: test_random.py    Apache License 2.0 5 votes
def test_unique_zipfian_generator():
    ctx = mx.context.current_context()
    if ctx.device_type == 'cpu':
        num_sampled = 8192
        range_max = 793472
        batch_size = 4
        op = mx.nd._internal._sample_unique_zipfian
        classes, num_trials = op(range_max, shape=(batch_size, num_sampled))
        for i in range(batch_size):
            num_trial = num_trials[i].asscalar()
            # test uniqueness
            assert np.unique(classes[i].asnumpy()).size == num_sampled
            # test num trials. reference count obtained from pytorch implementation
            assert num_trial > 14500
            assert num_trial < 17000 
Example 38
Project: FCOS_GluonCV   Author: DetectionTeamUCAS   File: cityscapes.py    Apache License 2.0 5 votes
def _class_to_index(self, mask):
        # assert the values
        values = np.unique(mask)
        for value in values:
            assert(value in self._mapping)
        index = np.digitize(mask.ravel(), self._mapping, right=True)
        return self._key[index].reshape(mask.shape) 
Example 39
Project: cvpr2018-hnd   Author: kibok90   File: samplers.py    MIT License 5 votes
def balanced_shuffle(labels, num_epochs=50, path=None, start_time=time.time()):

    order_path = '{path}/balanced_order_{num_epochs}.h5' \
                       .format(path=path, num_epochs=num_epochs)
    if path is not None and os.path.isfile(order_path):
        with h5py.File(order_path, 'r') as f:
            order = f['order'][:]
    else:
        evenness = 5 # batch_size | evenness*num_classes
        classes = np.unique(labels.numpy())
        num_classes = len(classes)
        loc_data_per_class = [np.argwhere(labels.numpy() == k).flatten() for k in classes]
        num_data_per_class = [(labels.numpy() == k).sum() for k in classes]
        max_data_per_class = max(num_data_per_class)
        num_loc_split = (max_data_per_class // evenness) * np.ones(evenness, dtype=int)
        num_loc_split[:(max_data_per_class % evenness)] += 1
        loc_split = [0]
        loc_split.extend(np.cumsum(num_loc_split).tolist())
        order = -np.ones([num_epochs, max_data_per_class*num_classes], dtype=int)
        for epoch in range(num_epochs):
            order_e = -np.ones([max_data_per_class, num_classes], dtype=int)
            for k in classes:
                loc_k = np.random.permutation(loc_data_per_class[k])
                for i in range(evenness):
                    loc_i = loc_k[loc_split[i]:loc_split[i+1]]
                    order_e[i:(len(loc_i)*evenness+i):evenness, k] = loc_i
            order[epoch] = order_e.flatten()
            print_freq = min([100, (num_epochs-1) // 5 + 1])
            print_me = (epoch == 0 or epoch == num_epochs-1 or (epoch+1) % print_freq == 0)
            if print_me:
                print('{epoch:4d}/{num_epochs:4d} e; '.format(epoch=epoch+1, num_epochs=num_epochs), end='')
                print('generate balanced random order; {time:8.3f} s'.format(time=time.time()-start_time))
        
        if path is not None:
            with h5py.File(order_path, 'w') as f:
                f.create_dataset('order', data=order, compression='gzip', compression_opts=9)
    
    print('balanced random order; {time:8.3f} s'.format(time=time.time()-start_time))
    return torch.from_numpy(order) 
Example 40
Project: cvpr2018-hnd   Author: kibok90   File: test.py    MIT License 5 votes
def count_test(p, counters, preds, labels, T, hierarchical_measure=False):

    label_hnd = T['label_hnd']
    
    if hierarchical_measure:
        HP_mat = T['HP_mat']
        HF_mat = T['HF_mat']
        dist_mat = T['dist_mat']
    
    for l in np.unique(labels.cpu().numpy()):
        preds_l = preds[(labels == int(l)).cpu().numpy().astype(bool)]
        acc = np.zeros_like(preds_l, dtype=bool)
        if hierarchical_measure:
            HE = MAX_DIST*np.ones_like(preds_l, dtype=int)
            HP, HR, HF = np.zeros_like(preds_l), np.zeros_like(preds_l), np.zeros_like(preds_l)
        for c in label_hnd[l]:
            acc |= (preds_l == c)
            if hierarchical_measure:
                HE = np.minimum(HE, dist_mat[preds_l, c])
                HP = np.maximum(HP, HP_mat[preds_l, c])
                HR = np.maximum(HR, HP_mat[c, preds_l])
                HF = np.maximum(HF, HF_mat[preds_l, c])
        
        if p == 0: counters['data'][l] += preds_l.shape[0]
        counters['acc'][p,l] += acc.sum()
        if hierarchical_measure:
            counters['HE'][p,l] += HE.sum()
            counters['HP'][p,l] += HP.sum()
            counters['HR'][p,l] += HR.sum()
            counters['HF'][p,l] += HF.sum() 
Example 41
Project: pypriv   Author: soeaver   File: visualize.py    MIT License 5 votes
def draw_mask(im, label, color_map, alpha=0.7):
    h, w = im.shape[:2]
    color = im.astype(np.float32, copy=True)
    # color = np.zeros((h, w, 3), dtype=np.uint8)

    category = np.unique(label)

    for c in list(category):
        color[np.where(label == c)] = color_map[c]

    mask = Image.blend(Image.fromarray(im), Image.fromarray(color), alpha)

    return np.array(mask) 
Example 42
Project: gullikson-scripts   Author: kgullikson88   File: StellarModel.py    MIT License 5 votes
def __init__(self, parameter_list):
        self.parameter_list = np.unique(parameter_list)
        self.index_interpolator = interp1d(self.parameter_list, np.arange(len(self.parameter_list)), kind='linear')
        pass 
Example 43
Project: IntroToDeepLearning   Author: robb-brown   File: input_data.py    MIT License 5 votes
def __init__(self, images, labels, fake_data=False):
    if fake_data:
      self._num_examples = 10000
    else:
      assert images.shape[0] == labels.shape[0], (
          "images.shape: %s labels.shape: %s" % (images.shape,
                                                 labels.shape))
      self._num_examples = images.shape[0]

      # Convert shape from [num examples, rows, columns, depth]
      # to [num examples, rows*columns] (assuming depth == 1)
      self.imageShape = images.shape[1:]
      self.imageChannels = self.imageShape[2]

      images = images.reshape(images.shape[0],
                              images.shape[1] * images.shape[2] * images.shape[3])
      # Convert from [0, 255] -> [0.0, 1.0].
      images = images.astype(numpy.float32)
      images = numpy.multiply(images, 1.0 / 255.0)
    self._images = images
    self._labels = labels
    try:
      if len(numpy.shape(self._labels)) == 1:
        self._labels = dense_to_one_hot(self._labels,len(numpy.unique(self._labels)))
    except:
      traceback.print_exc()
    self._epochs_completed = 0
    self._index_in_epoch = 0 
Example 44
Project: IntroToDeepLearning   Author: robb-brown   File: input_data.py    MIT License 5 votes
def __init__(self, images, labels, fake_data=False):
    if fake_data:
      self._num_examples = 10000
    else:
      assert images.shape[0] == labels.shape[0], (
          "images.shape: %s labels.shape: %s" % (images.shape,
                                                 labels.shape))
      self._num_examples = images.shape[0]

      # Convert shape from [num examples, rows, columns, depth]
      # to [num examples, rows*columns] (assuming depth == 1)
      self.imageShape = images.shape[1:]
      self.imageChannels = self.imageShape[2]

      images = images.reshape(images.shape[0],
                              images.shape[1] * images.shape[2] * images.shape[3])
      # Convert from [0, 255] -> [0.0, 1.0].
      images = images.astype(numpy.float32)
      images = numpy.multiply(images, 1.0 / 255.0)
    self._images = images
    self._labels = labels
    try:
      if len(numpy.shape(self._labels)) == 1:
        self._labels = dense_to_one_hot(self._labels,len(numpy.unique(self._labels)))
    except:
      traceback.print_exc()
    self._epochs_completed = 0
    self._index_in_epoch = 0 
Example 45
Project: deep-models   Author: LaurentMazare   File: rhn-text8.py    Apache License 2.0 5 votes
def load(filename):
  with open(filename, 'r') as f:
    data = f.read()
  data = np.fromstring(data, dtype=np.uint8)
  unique, data = np.unique(data, return_inverse=True)
  return data, len(unique) 
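
load() above leans on return_inverse to turn raw bytes into dense integer ids, with len(unique) as the vocabulary size (np.fromstring is deprecated; np.frombuffer is the current equivalent). A sketch on an in-memory string instead of a file:

import numpy as np

data = np.frombuffer(b"abracadabra", dtype=np.uint8)

# `unique` is the sorted alphabet of byte values; `ids` maps every position
# of the input to an index into that alphabet, so unique[ids] == data.
unique, ids = np.unique(data, return_inverse=True)
print(unique)        # [ 97  98  99 100 114]  (a, b, c, d, r)
print(ids)           # [0 1 4 0 2 0 3 0 1 4 0]
print(len(unique))   # vocabulary size: 5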
Example 46
Project: pidforest   Author: vatsalsharan   File: subcube.py    MIT License 5 votes
def get_box(self):
        self.start = np.zeros(self.dim)
        self.end = np.zeros(self.dim)
        self.val = []
        self.count = []
        for axis in range(self.dim):
            val, count = np.unique(np.array(self.points[axis]), return_counts=True)
            self.val.append(val) 
            self.count.append(count)
            if len(self.val[axis]) <= 1:
                self.start[axis] = self.val[axis][0]
                self.end[axis] = self.val[axis][0]
            else:
                self.start[axis] = (3*self.val[axis][0] - self.val[axis][1])/2
                self.end[axis] = (3*self.val[axis][-1] - self.val[axis][-2])/2 
Example 47
Project: pidforest   Author: vatsalsharan   File: old_forest.py    MIT License 5 votes
def fit(self, pts):
        self.points = pts
        if len(np.shape(self.points)) == 1:
            self.dim = 1
            self.size = len(self.points)
            self.points = np.reshape(self.points, (1, self.size))
        else:
            self.dim, self.size = np.shape(self.points)
        if int(self.sample_axis*self.dim) == 0:
            print("sample_axis is too low")
            return
        self.start = np.zeros(self.dim)
        self.end = np.zeros(self.dim)
        for axis in range(self.dim):
            val = np.unique(np.array(self.points[axis]))
            if len(val) <= 1:
                print("No entropy in dimension :", axis)
                return
            self.start[axis] = (3 * val[0] - val[1]) / 2
            self.end[axis] = (3 * val[-1] - val[-2]) / 2
        k_args = {'depth': 0, 'forest': self}
        max_sample_size = np.min((self.size, self.max_depth*50))
        sample = np.random.choice(self.size, max_sample_size, replace=False)
        for i in range(self.n_trees):
            k_args['indices'] = np.random.choice(self.size, self.max_samples, replace=False)
            root_node = Node(**k_args)
            root_node.compute_density(self.points[:, sample])
            self.tree.append(root_node) 
Example 48
Project: pidforest   Author: vatsalsharan   File: old_forest.py    MIT License 5 votes
def get_distinct(self):
        s_val = []
        s_count = []
        for axis in range(self.dim):
            val, count = np.unique(np.array(self.points[axis]), return_counts=True)
            s_val.append(val)
            s_count.append(count)
        return s_val, s_count 
Example 49
Project: sumo   Author: ratan-lab   File: utils.py    MIT License 5 votes
def purity(cl: np.ndarray, org: np.ndarray):
    """ Clustering accuracy measure representing percentage of total number of nodes classified correctly """
    assert cl.shape == org.shape

    acc = 0
    for label in np.unique(cl):
        labels = {}
        for node in range(len(org)):
            if cl[node] == label:
                if org[node] not in labels.keys():
                    labels[org[node]] = 0
                labels[org[node]] += 1
        acc += max(labels.values()) if labels.keys() else 0
    return acc / len(org) 
Example 50
Project: sumo   Author: ratan-lab   File: network.py    MIT License 5 votes
def get_clustering_quality(self, labels: np.ndarray):
        assert labels.shape[0] == self.nodes
        sim = 0
        for i in range(self.layers):
            adj = self.adj_matrices[i]
            sim_layer = 0
            for cluster in np.unique(labels):
                # sum of similarities for all pairs of samples in cluster
                from itertools import combinations
                sim_layer += np.sum(
                    [adj[x, y] for (x, y) in combinations(list(np.argwhere(labels == cluster).T[0]), 2)])
            # normalize using number of samples available in layer
            sim += (sim_layer / (self.samples[i].shape[0] ** 2))
        return sim 
Example 51
Project: tensorflow-alexnet   Author: jireh-father   File: kaggle_mnist_input.py    MIT License 5 votes
def load_mnist_train(validation_size=2000, batch_size=128):
    download_train()

    data = pd.read_csv(FLAGS.train_path)

    images = data.iloc[:, 1:].values
    images = images.astype(np.float)

    images = np.multiply(images, 1.0 / 255.0)

    image_size = images.shape[1]

    image_width = image_height = np.ceil(np.sqrt(image_size)).astype(np.uint8)
    images = images.reshape(-1, image_width, image_height, 1)

    labels_flat = data[[0]].values.ravel()
    labels_count = np.unique(labels_flat).shape[0]

    labels = dense_to_one_hot(labels_flat, labels_count)
    labels = labels.astype(np.uint8)

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]

    train_range = zip(range(0, len(train_images), batch_size), range(batch_size, len(train_images), batch_size))

    if len(train_images) % batch_size > 0:
        train_range.append((train_range[-1][1], len(train_images)))

    validation_indices = np.arange(len(validation_images))

    return train_images, train_labels, train_range, validation_images, validation_labels, validation_indices 
Example 52
Project: ArtGAN   Author: cs-chan   File: utils.py    BSD 3-Clause "New" or "Revised" License 5 votes
def datasetweights(dataset):
    dataset.reset()
    weights = np.zeros(10)
    for x, t in dataset:
        t = t.get().argmax(axis=0)
        i, c = np.unique(t, return_counts=True)
        for ii, cc in zip(i, c):
            weights[ii] += cc
    weights /= dataset.ndata
    return weights 
Example 53
Project: cascade-rcnn_Pytorch   Author: guoruoqian   File: cocoeval.py    MIT License 5 votes
def evaluate(self):
        '''
        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
        :return: None
        '''
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if not p.useSegm is None:
            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params=p

        self._prepare()
        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == 'segm' or p.iouType == 'bbox':
            computeIoU = self.computeIoU
        elif p.iouType == 'keypoints':
            computeIoU = self.computeOks
        self.ious = {(imgId, catId): computeIoU(imgId, catId) \
                        for imgId in p.imgIds
                        for catId in catIds}

        evaluateImg = self.evaluateImg
        maxDet = p.maxDets[-1]
        self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
                 for catId in catIds
                 for areaRng in p.areaRng
                 for imgId in p.imgIds
             ]
        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic)) 
Example 54
Project: cascade-rcnn_Pytorch   Author: guoruoqian   File: ds_utils.py    MIT License 5 votes
def unique_boxes(boxes, scale=1.0):
  """Return indices of unique boxes."""
  v = np.array([1, 1e3, 1e6, 1e9])
  hashes = np.round(boxes * scale).dot(v)
  _, index = np.unique(hashes, return_index=True)
  return np.sort(index) 
Example 55
Project: Parallel.GAMIT   Author: demiangomez   File: pyStack.py    GNU General Public License v3.0 5 votes
def __init__(self, polyhedrons):

        # get the mean epoch
        date = [poly.date.mjd for poly in polyhedrons]
        date = Date(mjd=np.mean(date))

        # get the set of stations
        stn = []
        for poly in polyhedrons:
            stn += poly.vertices['stn'].tolist()

        stn = np.unique(stn)

        # average the coordinates for each station
        poly = []
        for s in stn:
            v = np.array([])
            for p in polyhedrons:
                if not v.size:
                    v = p.vertices[p.vertices['stn'] == s]
                else:
                    v = np.concatenate((v, p.vertices[p.vertices['stn'] == s]))

            poly.append((s, np.mean(v['x']), np.mean(v['y']), np.mean(v['z']), date.year, date.doy, date.fyear))

        pp = np.array(poly, dtype=[('stn', 'S8'), ('x', 'float64'), ('y', 'float64'), ('z', 'float64'),
                                   ('yr', 'i4'), ('dd', 'i4'), ('fy', 'float64')])

        super(Combination, self).__init__(pp, polyhedrons[0].project, date) 
Example 56
Project: Parallel.GAMIT   Author: demiangomez   File: pyParallelGamit.py    GNU General Public License v3.0 5 votes
def check_station_codes(stn_obj):

    for i, stn1 in enumerate(stn_obj[:-1]):

        for stn2 in stn_obj[i+1:]:
            if stn1.NetworkCode != stn2.NetworkCode and stn1.StationCode == stn2.StationCode:
                # duplicate StationCode (different Network), produce Alias
                unique = False
                while not unique:
                    stn1.generate_alias()
                    # compare again to make sure this name is unique
                    unique = compare_aliases(stn1, stn_obj)

    return stn_obj 
Example 57
Project: Parallel.GAMIT   Author: demiangomez   File: pyParallelGamit.py    GNU General Public License v3.0 5 votes
def compare_aliases(Station, AllStations):

    # make sure alias does not exists as alias and station code

    for stn in AllStations:

        # this if prevents comparing against myself, although the station is not added until after
        # the call to CompareAliases. But, just in case...
        if stn.StationCode != Station.StationCode and stn.NetworkCode != Station.NetworkCode and \
                        Station.StationAlias == stn.StationAlias or Station.StationAlias == stn.StationCode:
            # not unique!
            return False

    return True 
Example 58
Project: Parallel.GAMIT   Author: demiangomez   File: test_voronoi.py    GNU General Public License v3.0 5 votes
def check_station_codes(stn_obj):

    for i, stn1 in enumerate(stn_obj[:-1]):

        for stn2 in stn_obj[i+1:]:
            if stn1.NetworkCode != stn2.NetworkCode and stn1.StationCode == stn2.StationCode:
                # duplicate StationCode (different Network), produce Alias
                unique = False
                while not unique:
                    stn1.generate_alias()
                    # compare again to make sure this name is unique
                    unique = compare_aliases(stn1, stn_obj)

    return stn_obj 
Example 59
Project: Parallel.GAMIT   Author: demiangomez   File: test_voronoi.py    GNU General Public License v3.0 5 votes
def compare_aliases(Station, AllStations):

    # make sure alias does not exists as alias and station code

    for stn in AllStations:

        # this if prevents comparing against myself, although the station is not added until after
        # the call to CompareAliases. But, just in case...
        if stn.StationCode != Station.StationCode and stn.NetworkCode != Station.NetworkCode and \
                        Station.StationAlias == stn.StationAlias or Station.StationAlias == stn.StationCode:
            # not unique!
            return False

    return True 
Example 60
Project: torch-toolbox   Author: PistonY   File: functional.py    BSD 3-Clause "New" or "Revised" License 5 votes
def poisson_noise(img):
    imgtype = img.dtype
    img = img.astype(np.float32) / 255.0
    vals = len(np.unique(img))
    vals = 2 ** np.ceil(np.log2(vals))
    noisy = 255 * np.clip(np.random.poisson(img.astype(np.float32) * vals) / float(vals), 0, 1)
    return noisy.astype(imgtype) 
Example 61
Project: python-toolbox-for-rapid   Author: Esri   File: CreateInflowFileFromECMWFRunoff.py    Apache License 2.0 5 votes
def dataIdentify(self, in_nc):
        """Check if the data is Ensemble 1-51 (low resolution) or 52 (high resolution)"""
        data_nc = NET.Dataset(in_nc)
        name_time = self.vars_oi[2]
        time = data_nc.variables[name_time][:]
        diff = NUM.unique(NUM.diff(time))
        data_nc.close()
        time_interval_highres = NUM.array([1.0,3.0,6.0],dtype=float)
        time_interval_lowres = NUM.array([6.0],dtype=float)
        if (diff == time_interval_highres).all():
            return "HighRes"
        elif (diff == time_interval_lowres).all():
            return "LowRes"
        else:
            return None 
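
dataIdentify collapses the time axis to its distinct step sizes with NUM.unique(NUM.diff(time)) and compares them to the expected sampling intervals; a sketch with an invented time vector.

import numpy as np

time = np.array([0.0, 1.0, 4.0, 10.0, 16.0, 22.0])   # invented timestamps (hours)

diff = np.unique(np.diff(time))      # distinct step sizes, sorted
print(diff)                          # [1. 3. 6.]

time_interval_highres = np.array([1.0, 3.0, 6.0])
print((diff == time_interval_highres).all())   # True -> "HighRes"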
Example 62
Project: sfcc   Author: kv-kunalvyas   File: auxiliary.py    MIT License 4 votes
def initialise_train(dates):
    if not dates:
        data_frame = pd.read_csv('../data/train.csv', header=0)
    elif dates:
        data_frame = pd.read_csv('../data/train.csv', header=0, parse_dates=['Dates'])
        data_frame['Year'] = data_frame['Dates'].map(lambda x: x.year)
        data_frame['Month'] = data_frame['Dates'].map(lambda x: x.month)
        data_frame['Week'] = data_frame['Dates'].map(lambda x: x.week)
        data_frame['Hour'] = data_frame['Dates'].map(lambda x: x.hour)

    # Change string categories to integer classifiers
    # 1. determine all values
    Categories = list(enumerate(sorted(np.unique(data_frame['Category']))))
    Descriptions = list(enumerate(sorted(np.unique(data_frame['Descript']))))
    DaysOfWeeks = list(enumerate(sorted(np.unique(data_frame['DayOfWeek']))))
    PdDistricts = list(enumerate(sorted(np.unique(data_frame['PdDistrict']))))
    Resolutions = list(enumerate(sorted(np.unique(data_frame['Resolution']))))
    # 2. set up dictionaries
    CategoriesDict = {name: i for i, name in Categories}
    DescriptionsDict = {name: i for i, name in Descriptions}
    DaysOfWeeksDict = {name: i for i, name in DaysOfWeeks}
    PdDistrictsDict = {name: i for i, name in PdDistricts}
    ResolutionsDict = {name: i for i, name in Resolutions}
    # 3. Convert all strings to int
    data_frame.Category = data_frame.Category.map(lambda x: CategoriesDict[x]).astype(int)
    data_frame.Descript = data_frame.Descript.map(lambda x: DescriptionsDict[x]).astype(int)
    data_frame.DayOfWeek = data_frame.DayOfWeek.map(lambda x: DaysOfWeeksDict[x]).astype(int)
    data_frame.PdDistrict = data_frame.PdDistrict.map(lambda x: PdDistrictsDict[x]).astype(int)
    data_frame.Resolution = data_frame.Resolution.map(lambda x: ResolutionsDict[x]).astype(int)

    xy_scaler = pp.StandardScaler()
    xy_scaler.fit(data_frame[["X", "Y"]])
    data_frame[["X", "Y"]] = xy_scaler.transform(data_frame[["X", "Y"]])
    data_frame["rot45_X"] = .707 * data_frame["Y"] + .707 * data_frame["X"]
    data_frame["rot45_Y"] = .707 * data_frame["Y"] - .707 * data_frame["X"]
    data_frame["rot30_X"] = (1.732 / 2) * data_frame["X"] + (1. / 2) * data_frame["Y"]
    data_frame["rot30_Y"] = (1.732 / 2) * data_frame["Y"] - (1. / 2) * data_frame["X"]
    data_frame["rot60_X"] = (1. / 2) * data_frame["X"] + (1.732 / 2) * data_frame["Y"]
    data_frame["rot60_Y"] = (1. / 2) * data_frame["Y"] - (1.732 / 2) * data_frame["X"]
    data_frame["radial_r"] = np.sqrt(np.power(data_frame["Y"], 2) + np.power(data_frame["X"], 2))

    # rounding off location coordinates to 2 decimal places
    data_frame.X = data_frame.X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.Y = data_frame.Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot45_X = data_frame.rot45_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot45_Y = data_frame.rot45_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot30_X = data_frame.rot30_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot30_Y = data_frame.rot30_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot60_X = data_frame.rot60_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot60_Y = data_frame.rot60_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.radial_r = data_frame.radial_r.map(lambda x: "%.2f" % round(x, 2)).astype(float)

    return data_frame 
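The enumerate/dictionary mapping above can also be expressed directly with np.unique and return_inverse, which returns the sorted unique values together with an integer code per row. A small sketch on made-up category strings (not the SFCC data):

import numpy as np

categories = np.array(['THEFT', 'ASSAULT', 'THEFT', 'VANDALISM'])  # hypothetical column
labels, codes = np.unique(categories, return_inverse=True)
print(labels)   # ['ASSAULT' 'THEFT' 'VANDALISM'], the sorted unique values
print(codes)    # [1 0 1 2], integer code per row, equivalent to the dict lookup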
Example 63
Project: sfcc   Author: kv-kunalvyas   File: auxiliary.py    MIT License 4 votes vote down vote up
def initialise_test(dates):
    if not dates:
        data_frame = pd.read_csv('../data/test.csv', header=0)
    elif dates:
        data_frame = pd.read_csv('../data/test.csv', header=0, parse_dates=['Dates'])
        data_frame['Year'] = data_frame['Dates'].map(lambda x: x.year)
        data_frame['Month'] = data_frame['Dates'].map(lambda x: x.month)
        data_frame['Week'] = data_frame['Dates'].map(lambda x: x.week)
        data_frame['Hour'] = data_frame['Dates'].map(lambda x: x.hour)

    # Change string categories to integer classifiers
    PdDistricts = list(enumerate(sorted(np.unique(data_frame['PdDistrict']))))
    DaysOfWeeks = list(enumerate(sorted(np.unique(data_frame['DayOfWeek']))))
    PdDistrictsDict = {name: i for i, name in PdDistricts}
    DaysOfWeeksDict = {name: i for i, name in DaysOfWeeks}
    data_frame.PdDistrict = data_frame.PdDistrict.map(lambda x: PdDistrictsDict[x]).astype(int)
    data_frame.DayOfWeek = data_frame.DayOfWeek.map(lambda x: DaysOfWeeksDict[x]).astype(int)

    xy_scaler = pp.StandardScaler()
    xy_scaler.fit(data_frame[["X", "Y"]])
    data_frame[["X", "Y"]] = xy_scaler.transform(data_frame[["X", "Y"]])
    data_frame["rot45_X"] = .707 * data_frame["Y"] + .707 * data_frame["X"]
    data_frame["rot45_Y"] = .707 * data_frame["Y"] - .707 * data_frame["X"]
    data_frame["rot30_X"] = (1.732 / 2) * data_frame["X"] + (1. / 2) * data_frame["Y"]
    data_frame["rot30_Y"] = (1.732 / 2) * data_frame["Y"] - (1. / 2) * data_frame["X"]
    data_frame["rot60_X"] = (1. / 2) * data_frame["X"] + (1.732 / 2) * data_frame["Y"]
    data_frame["rot60_Y"] = (1. / 2) * data_frame["Y"] - (1.732 / 2) * data_frame["X"]
    data_frame["radial_r"] = np.sqrt(np.power(data_frame["Y"], 2) + np.power(data_frame["X"], 2))

    # rounding off location coordinates to 2 decimal places
    data_frame.X = data_frame.X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.Y = data_frame.Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot45_X = data_frame.rot45_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot45_Y = data_frame.rot45_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot30_X = data_frame.rot30_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot30_Y = data_frame.rot30_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot60_X = data_frame.rot60_X.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.rot60_Y = data_frame.rot60_Y.map(lambda x: "%.2f" % round(x, 2)).astype(float)
    data_frame.radial_r = data_frame.radial_r.map(lambda x: "%.2f" % round(x, 2)).astype(float)

    return data_frame


# TODO: Fill missing values if any
# Compute mean of a column and fill missing values 
Example 64
Project: rhodonite   Author: nestauk   File: phylomemetic.py    MIT License 4 votes vote down vote up
def label_density(g, cooccurrence_graphs, norm=None):
    """label_density
    Creates a property map with the density of each vertex based on the items
    contained within the community it represents.
    Requires a cooccurrence graphs.

    Parameters
    ----------
        g : :obj:`graph_tool.Graph` 
            A graph.
        cooccurrence_graphs : :obj:`iter` of :obj:`Graph` 
            A list of cooccurrence graphs for each time period.
        norm :obj:`function`: 
            A normalisation function.

    Returns
    -------
        community_densities : :obj:`PropertyMap` A property map containing the
            densities of the phylomemetic graph vertices.
    """
    community_densities = g.new_vertex_property('float')

    time_steps = sorted(np.unique(g.vp['label'].a))
    
    co_time_step = None
    for v in g.vertices():
        ts = g.vp['label'][v]
        if ts != co_time_step:
            co_time_step = ts
            co = cooccurrence_graphs[ts]
            o_source = edge_endpoint_property(co, co.vp['occurrence'], 'source') 
            o_target = edge_endpoint_property(co, co.vp['occurrence'], 'target')

            density_prop = co.new_edge_property('float')
            density_prop.a = (
                    np.square(co.ep['cooccurrence'].a) / 
                    (o_source.a * o_target.a)
                    )
        community_densities[v] = community_density(
                g.vp['item'][v], 
                co,
                density_prop
                )
    return community_densities 
Example 65
Project: Collaborative-Learning-for-Weakly-Supervised-Object-Detection   Author: Sunarker   File: minibatch.py    MIT License 4 votes vote down vote up
def get_minibatch(roidb, num_classes):
  """Given a roidb, construct a minibatch sampled from it."""
  num_images = len(roidb)
  # Sample random scales to use for each image in this batch
  random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                  size=num_images)
  assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
    'num_images ({}) must divide BATCH_SIZE ({})'. \
    format(num_images, cfg.TRAIN.BATCH_SIZE)

  # Get the input image blob, formatted for caffe
  im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

  blobs = {'data': im_blob}

  assert len(im_scales) == 1, "Single batch only"
  assert len(roidb) == 1, "Single batch only"
  
  # gt boxes: (x1, y1, x2, y2, cls)
  #if cfg.TRAIN.USE_ALL_GT:
    # Include all ground truth boxes
  #  gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
  #else:
    # For the COCO ground truth boxes, exclude the ones that are ''iscrowd'' 
  #  gt_inds = np.where(roidb[0]['gt_classes'] != 0 & np.all(roidb[0]['gt_overlaps'].toarray() > -1.0, axis=1))[0]
  #gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
  #gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
  #gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
  boxes = roidb[0]['boxes'] * im_scales[0]
  batch_ind = 0 * np.ones((boxes.shape[0], 1))
  boxes = np.hstack((batch_ind, boxes))
  DEDUP_BOXES=1./16.
  if DEDUP_BOXES > 0:
    v = np.array([1,1e3, 1e6, 1e9, 1e12])
    hashes = np.round(boxes * DEDUP_BOXES).dot(v)
    _, index, inv_index = np.unique(hashes, return_index=True,
                                    return_inverse=True)
    boxes = boxes[index, :]
  
  blobs['boxes'] = boxes
  blobs['im_info'] = np.array(
    [im_blob.shape[1], im_blob.shape[2], im_scales[0]],
    dtype=np.float32)
  blobs['labels'] = roidb[0]['labels']

  return blobs 
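The de-duplication above hashes each (batch_ind, x1, y1, x2, y2) row into a single scalar and lets np.unique keep the first occurrence of each hash; return_index selects the surviving rows and return_inverse maps every original row back to its surviving duplicate. A minimal sketch with made-up boxes:

import numpy as np

boxes = np.array([[0, 0, 0, 16, 16],
                  [0, 0, 0, 16, 16],     # duplicate after quantization
                  [0, 8, 8, 32, 32]], dtype=np.float32)   # hypothetical boxes
v = np.array([1, 1e3, 1e6, 1e9, 1e12])
hashes = np.round(boxes * (1. / 16.)).dot(v)               # one scalar key per box
_, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True)
print(boxes[index, :].shape)                               # (2, 5): duplicates removed
print(inv_index)                                           # maps original rows to the kept rows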
Example 66
Project: prediction-constrained-topic-models   Author: dtak   File: select_best_runs_and_snapshots.py    MIT License 4 votes vote down vote up
def simplify_best_df_and_make_unicode_friendly(
        best_df,
        replacements={'WEIGHT_Y':'λ', '==':'=', "'":""},
        replacements_ascii={'λ':'WEIGHT_Y', '=':''},
        at_best_keys=[
            'LOGPDF_X_PERTOK_AT_BEST_SNAPSHOT',
            'LOGPDF_Y_PERDOC_AT_BEST_SNAPSHOT'],
        ):
    ''' Update legend names to be shorter/unicode friendly.

    Also adds _AT_BEST_SNAPSHOT fields
    '''
    legcolid = best_df.columns.tolist().index('LEGEND_NAME')
    best_df["LEGEND_NAME"] = best_df["LEGEND_NAME"].apply(lambda x: unicode(x))
    best_df["LEGEND_NAME_ASCII"] = best_df["LEGEND_NAME"].apply(lambda x: unicode(x))
    for row_id in range(best_df.shape[0]):
        leg_str = best_df.iloc[row_id, legcolid]
        for before, after in replacements.items():
            leg_str = leg_str.replace(before, after)
        leg_str = ' '.join(leg_str.split())
        best_df.iloc[row_id, legcolid] = leg_str

        # Now make ascii-safe version of each name
        leg_str_ascii = leg_str
        for before, after in replacements_ascii.items():
            leg_str_ascii = leg_str_ascii.replace(before, after)
        best_df.loc[row_id, 'LEGEND_NAME_ASCII'] = (
            ' '.join(leg_str_ascii.decode('ascii').split())).replace(' ', '_')
        
    at_best_row_mask = best_df.IS_BEST_SNAPSHOT.values > 0
    for leg in np.unique(best_df['_UNIQUE_LEGEND_NAME'].values):
        for split in np.unique(best_df['SPLIT_NAME'].values):
            leg_split_row_mask = np.logical_and(
                best_df._UNIQUE_LEGEND_NAME.values == leg,
                best_df.SPLIT_NAME.values == split)
            best_leg_split_row_mask = np.logical_and(
                at_best_row_mask, leg_split_row_mask)

            assert np.sum(best_leg_split_row_mask) == 1
            assert np.sum(best_leg_split_row_mask) < np.sum(leg_split_row_mask)
            for at_best_key in at_best_keys:
                target_key = at_best_key.replace('_AT_BEST_SNAPSHOT', '')
                best_leg_split_row_id = np.flatnonzero(best_leg_split_row_mask)[0]
                val_at_best = best_df[target_key].values[best_leg_split_row_id]
                best_df.loc[leg_split_row_mask, at_best_key] = val_at_best

    # Verify all row indices are unique
    assert best_df.shape[0] == np.unique(best_df.index.values).size

    return best_df 
Example 67
Project: FRIDA   Author: LCAV   File: bands_selection.py    MIT License 4 votes vote down vote up
def select_bands(samples, freq_range, fs, nfft, win, n_bands, div=1):
    '''
    Selects the bins with the most energy in a frequency range.

    It is possible to specify a div factor: the range is then subdivided
    into div equal subbands, and n_bands / div bins are selected per subband.
    '''

    if win is not None and isinstance(win, bool):
        if win:
            win = np.hanning(nfft)
        else:
            win = None

    # Read the signals in a single array
    sig = [wavfile.read(s)[1] for s in samples]
    L = max([s.shape[0] for s in sig])
    signals = np.zeros((L,len(samples)), dtype=np.float32)
    for i in range(signals.shape[1]):
        signals[:sig[i].shape[0],i] = sig[i] / np.std(sig[i][sig[i] > 1e-2])

    sum_sig = np.sum(signals, axis=1)

    sum_STFT = pra.stft(sum_sig, nfft, nfft, win=win, transform=rfft).T
    sum_STFT_avg = np.mean(np.abs(sum_STFT)**2, axis=1)

    # Do some band selection
    bnds = np.linspace(freq_range[0], freq_range[1], div+1)

    freq_hz = np.zeros(n_bands)
    freq_bins = np.zeros(n_bands, dtype=int)

    nsb = n_bands // div

    for i in range(div):

        bl = int(bnds[i] / fs * nfft)
        bh = int(bnds[i+1] / fs * nfft)

        k = np.argsort(sum_STFT_avg[bl:bh])[-nsb:]

        freq_hz[nsb*i:nsb*(i+1)] = (bl + k) / nfft * fs
        freq_bins[nsb*i:nsb*(i+1)] = k + bl

    freq_hz = freq_hz[:n_bands]

    return np.unique(freq_hz), np.unique(freq_bins) 
Example 68
Project: uci-download-process   Author: cperales   File: fold_data.py    MIT License 4 votes vote down vote up
def k_folding(data_folder, log_file, file=None, classification=True):
    dir_file_pairs = dir_file(data_folder, file)

    # SPLITTING ONE DATASET FILE IN N_FOLDS
    n_fold = 10
    with open(log_file, 'w') as f:
        for dir_file_pair in dir_file_pairs:
            try:
                dir_name, file_name = dir_file_pair
                # print('Folding {}'.format(file_name))
                df_file = pd.read_csv(os.path.join(dir_name, file_name),
                                      sep='\s+',
                                      header=None)
                target_position = df_file.columns[-1]
                x = df_file[[i for i in range(target_position)]]
                y = df_file[[target_position]]
                # Testing if there is enough instances for n fold
                count = [np.count_nonzero(y == label) for label in np.unique(y)]
                if np.min(count) < 2:
                    raise ValueError('Not enough elements of one label')
                rep = np.max(count)  # If maximum is not enough to n fold
                if n_fold > rep:
                    times = math.ceil(n_fold / rep)
                    x = pd.concat(times * [x])
                    y = pd.concat(times * [y])
                # Shuffle is kept False for the stratified folds in order to preserve the original ordering
                i = 0
                file = file_name.replace('.data', '')
                if classification is True:
                    kf = StratifiedKFold(n_splits=n_fold, shuffle=False)
                else:
                    kf = KFold(n_splits=n_fold, shuffle=True)
                for train_index, test_index in kf.split(X=x, y=y):
                    x_train_fold = x.iloc[train_index]
                    y_train_fold = y.iloc[train_index]
                    train_fold = pd.concat([x_train_fold, y_train_fold], axis=1)
                    train_fold_name = '.'.join(['_'.join(['train', file]), str(i)])
                    train_fold_name_path = os.path.join(dir_name, train_fold_name)
                    train_fold.to_csv(train_fold_name_path,
                                      sep=' ',
                                      header=None,
                                      index=False)

                    x_test_fold = x.iloc[test_index]
                    y_test_fold = y.iloc[test_index]
                    test_fold = pd.concat([x_test_fold, y_test_fold], axis=1)
                    test_fold_name = '.'.join(['_'.join(['test', file]), str(i)])
                    test_fold_name_path = os.path.join(dir_name, test_fold_name)
                    test_fold.to_csv(test_fold_name_path,
                                     sep=' ',
                                     header=None,
                                     index=False)

                    i += 1
            except ValueError as e:
                print(e, ', '
                         'so {} can\'t be stratified'.format(file_name))
                f.write(os.path.join('processed/', file_name))
                f.write('\n')
                shutil.rmtree(dir_name) 
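The label check above can be written more compactly with return_counts, which yields each class label together with how often it occurs. A sketch on a made-up label vector:

import numpy as np

y = np.array([0, 1, 1, 2, 2, 2])                 # hypothetical labels
labels, counts = np.unique(y, return_counts=True)
print(labels)                # [0 1 2]
print(counts)                # [1 2 3]
print(np.min(counts) < 2)    # True: label 0 has too few instances for stratified folding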
Example 69
Project: skylab   Author: coenders   File: ps_injector.py    GNU General Public License v3.0 4 votes vote down vote up
def sample(self, src_ra, mean_mu, poisson=True):
        r"""Sample events for given source location.

        Parameters
        -----------
        src_ra : float
            Right ascension of source position
        mean_mu : float
            Mean number of events to sample
        poisson : bool, optional
            Use Poisson fluctuations, otherwise sample `mean_mu`.

        Returns
        --------
        num : int
            Number of events
        sam_ev : iterator
            Sampled events for each loop iteration; either as simple
            array or as dictionary for each sample

        """
        while True:
            # Generate the number of events using Poisson statistics.
            if poisson:
                num = self.random.poisson(mean_mu)
            else:
                num = int(np.around(mean_mu))

            self._logging.info("Mean number of events {0:.1f}".format(mean_mu))
            self._logging.info("Generated number of events {0:d}".format(num))

            if num < 1:
                # No events will be sampled.
                yield num, None
                continue

            sam_idx = self.random.choice(self.mc_arr, size=num, p=self._norm_w)

            # Get the events that were sampled.
            enums = np.unique(sam_idx["enum"])

            if len(enums) == 1 and enums[0] < 0:
                # Only a single Monte Carlo sample is in use.
                sam_ev = np.copy(self.mc[enums[0]][sam_idx["idx"]])
                yield num, rotate_struct(sam_ev, src_ra, self.src_dec)
                continue

            sam_ev = dict()
            for enum in enums:
                idx = sam_idx[sam_idx["enum"] == enum]["idx"]
                sam_ev_i = np.copy(self.mc[enum][idx])
                sam_ev[enum] = rotate_struct(sam_ev_i, src_ra, self.src_dec)

            yield num, sam_ev 
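In the generator above, np.unique is applied to one field of a structured array to find which Monte Carlo samples the drawn events belong to, so the events can then be grouped per sample. A small sketch with a hypothetical structured array standing in for the sampled indices:

import numpy as np

sam_idx = np.array([(0, 10), (1, 3), (0, 7)],
                   dtype=[("enum", np.int32), ("idx", np.int32)])   # made-up draws
enums = np.unique(sam_idx["enum"])
print(enums)                                    # [0 1]
for enum in enums:
    idx = sam_idx[sam_idx["enum"] == enum]["idx"]
    print(enum, idx)                            # event indices grouped per sample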
Example 70
Project: DOTA_models   Author: ringringyi   File: data_utils.py    Apache License 2.0 4 votes vote down vote up
def write_datafiles(directory, write_file,
                    resize=True, rotate=False,
                    new_width=IMAGE_NEW_SIZE, new_height=IMAGE_NEW_SIZE,
                    first_label=0):
  """Load and preprocess images from a directory and write them to a file.

  Args:
    directory: Directory of alphabet sub-directories.
    write_file: Filename to write to.
    resize: Whether to resize the images.
    rotate: Whether to augment the dataset with rotations.
    new_width: New resize width.
    new_height: New resize height.
    first_label: Label to start with.

  Returns:
    Number of new labels created.
  """

  # these are the default sizes for Omniglot:
  imgwidth = IMAGE_ORIGINAL_SIZE
  imgheight = IMAGE_ORIGINAL_SIZE

  logging.info('Reading the data.')
  images, labels, info = crawl_directory(directory,
                                         augment_with_rotations=rotate,
                                         first_label=first_label)

  images_np = np.zeros([len(images), imgwidth, imgheight], dtype=np.bool)
  labels_np = np.zeros([len(labels)], dtype=np.uint32)
  for i in xrange(len(images)):
    images_np[i, :, :] = images[i]
    labels_np[i] = labels[i]

  if resize:
    logging.info('Resizing images.')
    resized_images = resize_images(images_np, new_width, new_height)

    logging.info('Writing resized data in float32 format.')
    data = {'images': resized_images,
            'labels': labels_np,
            'info': info}
    with tf.gfile.GFile(write_file, 'w') as f:
      pickle.dump(data, f)
  else:
    logging.info('Writing original sized data in boolean format.')
    data = {'images': images_np,
            'labels': labels_np,
            'info': info}
    with tf.gfile.GFile(write_file, 'w') as f:
      pickle.dump(data, f)

  return len(np.unique(labels_np)) 
Example 71
Project: DOTA_models   Author: ringringyi   File: graph_utils.py    Apache License 2.0 4 votes vote down vote up
def rng_room_to_room(batch_size, gtG, rng, max_dist, max_dist_to_compute,
                     node_room_ids, nodes=None, compute_path=False):
  # Sample one of the rooms and compute the distance field. Pick a destination in
  # another room if possible, otherwise anywhere outside this room.
  dists = []; pred_maps = []; paths = []; start_node_ids = []; end_node_ids = [];
  room_ids = np.unique(node_room_ids[node_room_ids[:,0] >= 0, 0])
  for i in range(batch_size):
    room_id = rng.choice(room_ids)
    end_node_id = rng.choice(np.where(node_room_ids[:,0] == room_id)[0])
    end_node_ids.append(end_node_id)

    # Compute distances.
    dist, pred_map = gt.topology.shortest_distance(
        gt.GraphView(gtG, reversed=True), source=gtG.vertex(end_node_id),
        target=None, max_dist=max_dist_to_compute, pred_map=True)
    dist = np.array(dist.get_array())
    pred_map = np.array(pred_map.get_array())
    dists.append(dist)
    pred_maps.append(pred_map)

    # Randomly sample nodes which are within max_dist.
    near_ids = dist <= max_dist
    near_ids = near_ids[:, np.newaxis]

    # Check to see if there is a non-negative node which is close enough.
    non_same_room_ids = node_room_ids != room_id
    non_hallway_ids = node_room_ids != -1
    good1_ids = np.logical_and(near_ids, np.logical_and(non_same_room_ids, non_hallway_ids))
    good2_ids = np.logical_and(near_ids, non_hallway_ids)
    good3_ids = near_ids
    if np.any(good1_ids):
      start_node_id = rng.choice(np.where(good1_ids)[0])
    elif np.any(good2_ids):
      start_node_id = rng.choice(np.where(good2_ids)[0])
    elif np.any(good3_ids):
      start_node_id = rng.choice(np.where(good3_ids)[0])
    else:
      logging.error('Did not find any good nodes.')

    start_node_ids.append(start_node_id)

    path = None
    if compute_path:
      path = get_path_ids(start_node_ids[i], end_node_ids[i], pred_map)
    paths.append(path)

  return start_node_ids, end_node_ids, dists, pred_maps, paths 
Example 72
Project: gullikson-scripts   Author: kgullikson88   File: StellarModel.py    MIT License 4 votes vote down vote up
def __init__(self, filename, ranges={"temp": (0, np.inf),
                                         "logg": (-np.inf, np.inf),
                                         "Z": (-np.inf, np.inf),
                                         "alpha": (-np.inf, np.inf)}):
        '''
            :param filename: the name of the HDF5 file
            :type filename: string
            :param ranges: optionally select a smaller part of the grid to use.
            :type ranges: dict
        '''
        self.filename = filename
        self.flux_name = "t{temp:.0f}g{logg:.1f}z{Z:.1f}a{alpha:.1f}"
        grid_parameters = ("temp", "logg", "Z", "alpha")  # Allowed grid parameters
        grid_set = frozenset(grid_parameters)

        with h5py.File(self.filename, "r") as hdf5:
            self.wl = hdf5["wl"][:]
            self.wl_header = dict(hdf5["wl"].attrs.items())

            grid_points = []

            for key in hdf5["flux"].keys():
                # assemble all temp, logg, Z, alpha keywords into a giant list
                hdr = hdf5['flux'][key].attrs

                params = {k: hdr[k] for k in grid_set}

                #Check whether the parameters are within the range
                for kk, vv in params.items():
                    low, high = ranges[kk]
                    if (vv < low) or (vv > high):
                        break
                else:
                    #If all parameters have passed successfully through the ranges, allow.
                    grid_points.append(params)

            self.list_grid_points = grid_points

        # determine the bounding regions of the grid by sorting the grid_points
        temp, logg, Z, alpha = [], [], [], []
        for param in self.list_grid_points:
            temp.append(param['temp'])
            logg.append(param['logg'])
            Z.append(param['Z'])
            alpha.append(param['alpha'])

        self.bounds = {"temp": (min(temp), max(temp)),
                       "logg": (min(logg), max(logg)),
                       "Z": (min(Z), max(Z)),
                       "alpha": (min(alpha), max(alpha))}

        self.points = {"temp": np.unique(temp),
                       "logg": np.unique(logg),
                       "Z": np.unique(Z),
                       "alpha": np.unique(alpha)}

        self.ind = None  #Overwritten by other methods using this as part of a ModelInterpolator 
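The self.points dictionary above uses np.unique to recover the sorted axis values of the grid from the flat list of grid points. A sketch with made-up parameter lists:

import numpy as np

temp = [6000, 5800, 6000, 6200, 5800]   # hypothetical grid temperatures
logg = [4.0, 4.5, 4.5, 4.0, 4.0]        # hypothetical surface gravities
points = {"temp": np.unique(temp), "logg": np.unique(logg)}
print(points["temp"])   # [5800 6000 6200]
print(points["logg"])   # [4.  4.5]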
Example 73
Project: SpikeyTree   Author: paula-tataru   File: utils.py    GNU General Public License v3.0 4 votes vote down vote up
def read_data(filename):
    '''read data from file'''
    f = open(filename)
    # read tree
    tree = f.readline()
    # read sample sizes
    aux = f.readline()
    n = np.array(map(float, aux.split()))
    # read labels
    labels = f.readline().split()
    
    row_data = np.empty(len(labels))
    data_matrix = np.empty([0, len(labels)])
    freq_matrix = np.empty([0, len(labels)])
    # read the entries in the data matrix
    for line in f:
        row_data = map(int, line.split())
        data_matrix = np.vstack((data_matrix, row_data))
        freq_matrix = np.vstack((freq_matrix, row_data/n))
    f.close()
    
    # identify unique rows in the data
    bins = np.ascontiguousarray(data_matrix)
    bins = bins.view(np.dtype((np.void, 
                         data_matrix.dtype.itemsize * data_matrix.shape[1])))
    _, idx, count = np.unique(bins, return_index=True, return_counts=True)
    
    data = {}
    sample = {}
    for i,l in enumerate(labels):
        data[l] = []
        sample[l] = n[i]
    # store unique data
    for j in idx:
        for i, l in enumerate(labels):
            data[l].append(data_matrix[j, i]) 
    
    # add fake fixed and lost sites
    for l in labels:
        data[l].append(0)
        data[l].append(sample[l])
    
    return tree, [data, count, sample, [np.mean(freq_matrix), np.var(freq_matrix)]] 
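The void-view trick above is a classic way of finding unique rows; on NumPy 1.13 and later the same result can be obtained directly with the axis argument. A sketch on a made-up matrix:

import numpy as np

data_matrix = np.array([[1, 2], [3, 4], [1, 2]], dtype=float)   # hypothetical rows
rows, idx, count = np.unique(data_matrix, axis=0,
                             return_index=True, return_counts=True)
print(rows)    # [[1. 2.] [3. 4.]]
print(idx)     # [0 1], first occurrence of each unique row
print(count)   # [2 1]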
Example 74
Project: CIMtools   Author: stsouko   File: similarity_distance.py    GNU General Public License v3.0 4 votes vote down vote up
def fit(self, X, y=None):
        """Fit distance-based AD.
        All AD’s model hyperparameters were selected based on internal cross-validation using training set.
        The hyperparameters of the AD definition approach have been optimized in the cross-validation,
        where metrics RMSE_AD or BA_AD were used as maximized scoring functions.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        self : object
            Returns self.
        """
        # Check data
        X = check_array(X)
        self.tree = BallTree(X, leaf_size=self.leaf_size, metric=self.metric)
        dist_train = self.tree.query(X, k=2)[0]
        if self.threshold == 'auto':
            self.threshold_value = 0.5 * sqrt(var(dist_train[:, 1])) + mean(dist_train[:, 1])
        elif self.threshold == 'cv':
            if y is None:
                raise ValueError("Y must be specified to find the optimal threshold.")
            y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
            self.threshold_value = 0
            score_value = 0
            Y_pred, Y_true, AD = [], [], []
            cv = KFold(n_splits=5, random_state=1, shuffle=True)
            for train_index, test_index in cv.split(X):
                x_train = safe_indexing(X, train_index)
                x_test = safe_indexing(X, test_index)
                y_train = safe_indexing(y, train_index)
                y_test = safe_indexing(y, test_index)
                data_test = safe_indexing(dist_train[:, 1], test_index)
                if self.reg_model is None:
                    reg_model = RandomForestRegressor(n_estimators=500, random_state=1).fit(x_train, y_train)
                else:
                    reg_model = clone(self.reg_model).fit(x_train, y_train)
                Y_pred.append(reg_model.predict(x_test))
                Y_true.append(y_test)
                AD.append(data_test)
            AD_stack = hstack(AD)
            AD_ = unique(AD_stack)
            for z in AD_:
                AD_new = AD_stack <= z
                if self.score == 'ba_ad':
                    val = balanced_accuracy_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
                elif self.score == 'rmse_ad':
                    val = rmse_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
                if val >= score_value:
                    score_value = val
                    self.threshold_value = z
        else:
            self.threshold_value = self.threshold
        return self 
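In the 'cv' branch above, unique(AD_stack) turns the pooled nearest-neighbour distances into a sorted list of candidate thresholds, and every candidate defines an in/out-of-domain mask that is then scored. A minimal sketch of that scan with made-up distances:

import numpy as np

distances = np.array([0.2, 0.5, 0.2, 0.9])   # hypothetical distances to the training set
for z in np.unique(distances):               # [0.2 0.5 0.9], sorted candidate thresholds
    in_domain = distances <= z
    print(z, int(in_domain.sum()))           # samples inside the AD at this cut-off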
Example 75
Project: CIMtools   Author: stsouko   File: leverage.py    GNU General Public License v3.0 4 votes vote down vote up
def fit(self, X, y=None):
        """Learning is to find the inverse matrix for X and calculate the threshold.
        All AD’s model hyperparameters were selected based on internal cross-validation using training set.
        The hyperparameters of the AD definition approach have been optimized in the cross-validation,
        where metrics RMSE_AD or BA_AD were used as maximized scoring functions.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency.
        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (real numbers in regression).

        Returns
        -------
        self : object
        """
        # Check that X have correct shape
        X = check_array(X)
        self.inverse_influence_matrix = self.__make_inverse_matrix(X)
        if self.threshold == 'auto':
            self.threshold_value = 3 * (1 + X.shape[1]) / X.shape[0]
        elif self.threshold == 'cv':
            if y is None:
                raise ValueError("Y must be specified to find the optimal threshold.")
            y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
            self.threshold_value = 0
            score_value = 0
            Y_pred, Y_true, AD = [], [], []
            cv = KFold(n_splits=5, random_state=1, shuffle=True)
            for train_index, test_index in cv.split(X):
                x_train = safe_indexing(X, train_index)
                x_test = safe_indexing(X, test_index)
                y_train = safe_indexing(y, train_index)
                y_test = safe_indexing(y, test_index)
                if self.reg_model is None:
                    reg_model = RandomForestRegressor().fit(x_train, y_train)
                else:
                    reg_model = clone(self.reg_model).fit(x_train, y_train)
                Y_pred.append(reg_model.predict(x_test))
                Y_true.append(y_test)
                ad_model = self.__make_inverse_matrix(x_train)
                AD.append(self.__find_leverages(x_test, ad_model))
            AD_stack = hstack(AD)
            AD_ = unique(AD_stack)
            for z in AD_:
                AD_new = AD_stack <= z
                if self.score == 'ba_ad':
                    val = balanced_accuracy_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
                else:
                    val = rmse_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
                if val >= score_value:
                    score_value = val
                    self.threshold_value = z
        else:
            self.threshold_value = self.threshold
        return self 
Example 76
Project: CIMtools   Author: stsouko   File: reaction_type_control_selection.py    GNU General Public License v3.0 4 votes vote down vote up
def rtc_env_selection(X, y, data, envs, reg_model, score):
    """
    Function for finding the best number of neighbours for the ReactionTypeControl method.

    All of the AD model's hyperparameters were selected by internal cross-validation on the
    training set. The hyperparameters of the AD definition approach were optimized in the
    cross-validation, with the RMSE_AD or BA_AD metrics used as the scoring functions to maximize.

    :param X: array-like or sparse matrix, shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
             to a sparse ``csr_matrix``.
    :param y: array-like, shape = [n_samples] or [n_samples, n_outputs]
             The target values (real numbers in regression).
    :param data: after read rdf file
    :param envs: list or tuple. Numbers of neighbours.
    :param reg_model: estimator
    :param score: 'ba_score' or 'rmse_score'
    :return: int
    """
    X = check_array(X)
    y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    data = iter2array(data, dtype=ReactionContainer)

    if not isinstance(envs, (list, tuple)):
        raise ValueError('envs must be list or tuple.')
    if reg_model is None:
        raise ValueError('Model is not defined.')
    if score not in ('ba_ad', 'rmse_ad'):
        raise ValueError('Invalid value for score. Allowed string values are "ba_ad", "rmse_ad".')

    cv = KFold(n_splits=5, shuffle=True, random_state=1)
    score_value = 0
    env_value = 0
    for env in envs:

        Y_pred, Y_true, AD = [], [], []
        for train_index, test_index in cv.split(X):
            x_train = safe_indexing(X, train_index)
            x_test = safe_indexing(X, test_index)
            y_train = safe_indexing(y, train_index)
            y_test = safe_indexing(y, test_index)
            data_train = safe_indexing(data, train_index)
            data_test = safe_indexing(data, test_index)
            Y_pred.append(reg_model.fit(x_train, y_train).predict(x_test))
            Y_true.append(y_test)
            AD.append(ReactionTypeControl(env=env).fit(data_train).predict(data_test))
        AD_stack = hstack(AD)
        AD_ = unique(AD_stack)
        for z in AD_:
            AD_new = AD_stack <= z
        if score == 'ba_ad':
            val = balanced_accuracy_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
        elif score == 'rmse_ad':
            val = rmse_score_with_ad(Y_true=hstack(Y_true), Y_pred=hstack(Y_pred), AD=AD_new)
        if val >= score_value:
            score_value = val
            env_value = env
    return env_value 
Example 77
Project: specio   Author: paris-saclay-cds   File: cli.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def main():
    import argparse
    parser = argparse.ArgumentParser(
        prog='specio',
        description='Python input/output for spectroscopic files')

    subparsers = parser.add_subparsers(metavar='', dest='sub')

    convert_parser = subparsers.add_parser(
        'convert',
        description='Convert spectroscopic files.',
        help='Convert spectroscopic files.')

    convert_parser.add_argument(
        'filepath', help='The file to convert. Wildcards are accepted'
        ' (e.g. "*.spc")')

    convert_parser.add_argument(
        '--output', '-o', nargs=1,
        help='The output file path. If not specified, use same path and name '
             'as input with different extension.')

    convert_parser.add_argument(
        '--tolerance', '-t', nargs=1, type=float, default=[1e-5],
        help='Tolerance to merge spectrum when their wavelength are slightly '
             'different (default=1e-5)')

    args = parser.parse_args()

    if args.sub == 'convert':
        filenames = _validate_filenames(args.filepath)
        tol_wavelength = args.tolerance[0]
        spectrum = specread(filenames, tol_wavelength=tol_wavelength)

        # case where we could not merge the spectra together
        if isinstance(spectrum, list):
            if args.output:
                # remove the extension in case that the user gave one
                output_basename, _ = os.path.splitext(args.output[0])
                for idx, sp in enumerate(spectrum):
                    sp.to_csv(output_basename + '_{}.csv'.format(idx))
            else:
                output_basename = [sp.meta['filename'] for sp in spectrum]
                if np.unique(output_basename).size == len(output_basename):
                    for name, sp in zip(output_basename, spectrum):
                        sp.to_csv(os.path.splitext(name)[0] + '.csv')
                else:
                    basename = os.path.splitext(output_basename[0])[0]
                    for idx, sp in enumerate(spectrum):
                        sp.to_csv(basename + '_{}.csv'.format(idx))

        # case where we have a single spectrum
        else:
            if args.output:
                output_path = args.output[0]
            else:
                # we are using the first name as a basename
                output_path = os.path.splitext(filenames[0])[0] + '.csv'

            spectrum.to_csv(output_path)
            print("Written {}".format(output_path)) 
Example 78
Project: transferlearning   Author: jindongwang   File: JDA.py    MIT License 4 votes vote down vote up
def fit_predict(self, Xs, Ys, Xt, Yt):
        '''
        Transform and predict using 1NN, as the JDA paper did
        :param Xs: ns * n_feature, source feature
        :param Ys: ns * 1, source label
        :param Xt: nt * n_feature, target feature
        :param Yt: nt * 1, target label
        :return: acc, y_pred, list_acc
        '''
        list_acc = []
        X = np.hstack((Xs.T, Xt.T))
        X /= np.linalg.norm(X, axis=0)
        m, n = X.shape
        ns, nt = len(Xs), len(Xt)
        e = np.vstack((1 / ns * np.ones((ns, 1)), -1 / nt * np.ones((nt, 1))))
        C = len(np.unique(Ys))
        H = np.eye(n) - 1 / n * np.ones((n, n))

        M = 0
        Y_tar_pseudo = None
        for t in range(self.T):
            N = 0
            M0 = e * e.T * C
            if Y_tar_pseudo is not None and len(Y_tar_pseudo) == nt:
                for c in range(1, C + 1):
                    e = np.zeros((n, 1))
                    tt = Ys == c
                    e[np.where(tt == True)] = 1 / len(Ys[np.where(Ys == c)])
                    yy = Y_tar_pseudo == c
                    ind = np.where(yy == True)
                    inds = [item + ns for item in ind]
                    e[tuple(inds)] = -1 / len(Y_tar_pseudo[np.where(Y_tar_pseudo == c)])
                    e[np.isinf(e)] = 0
                    N = N + np.dot(e, e.T)
            M = M0 + N
            M = M / np.linalg.norm(M, 'fro')
            K = kernel(self.kernel_type, X, None, gamma=self.gamma)
            n_eye = m if self.kernel_type == 'primal' else n
            a, b = np.linalg.multi_dot([K, M, K.T]) + self.lamb * np.eye(n_eye), np.linalg.multi_dot([K, H, K.T])
            w, V = scipy.linalg.eig(a, b)
            ind = np.argsort(w)
            A = V[:, ind[:self.dim]]
            Z = np.dot(A.T, K)
            Z /= np.linalg.norm(Z, axis=0)
            Xs_new, Xt_new = Z[:, :ns].T, Z[:, ns:].T

            clf = KNeighborsClassifier(n_neighbors=1)
            clf.fit(Xs_new, Ys.ravel())
            Y_tar_pseudo = clf.predict(Xt_new)
            acc = sklearn.metrics.accuracy_score(Yt, Y_tar_pseudo)
            list_acc.append(acc)
            print('JDA iteration [{}/{}]: Acc: {:.4f}'.format(t + 1, self.T, acc))
        return acc, Y_tar_pseudo, list_acc 
Example 79
Project: transferlearning   Author: jindongwang   File: MEDA.py    MIT License 4 votes vote down vote up
def fit_predict(self, Xs, Ys, Xt, Yt):
        '''
        Transform and Predict
        :param Xs: ns * n_feature, source feature
        :param Ys: ns * 1, source label
        :param Xt: nt * n_feature, target feature
        :param Yt: nt * 1, target label
        :return: acc, y_pred, list_acc
        '''
        gfk = GFK.GFK(dim=self.dim)
        _, Xs_new, Xt_new = gfk.fit(Xs, Xt)
        Xs_new, Xt_new = Xs_new.T, Xt_new.T
        X = np.hstack((Xs_new, Xt_new))
        n, m = Xs_new.shape[1], Xt_new.shape[1]
        C = len(np.unique(Ys))
        list_acc = []
        YY = np.zeros((n, C))
        for c in range(1, C + 1):
            ind = np.where(Ys == c)
            YY[ind, c - 1] = 1
        YY = np.vstack((YY, np.zeros((m, C))))
        YY[0, 1:] = 0

        X /= np.linalg.norm(X, axis=0)
        L = 0  # Graph Laplacian is on the way...
        knn_clf = KNeighborsClassifier(n_neighbors=1)
        knn_clf.fit(X[:, :n].T, Ys.ravel())
        Cls = knn_clf.predict(X[:, n:].T)
        K = kernel(self.kernel_type, X, X2=None, gamma=self.gamma)
        E = np.diagflat(np.vstack((np.ones((n, 1)), np.zeros((m, 1)))))
        for t in range(1, self.T + 1):
            mu = self.estimate_mu(Xs_new.T, Ys, Xt_new.T, Cls)
            e = np.vstack((1 / n * np.ones((n, 1)), -1 / m * np.ones((m, 1))))
            M = e * e.T * C
            N = 0
            for c in range(1, C + 1):
                e = np.zeros((n + m, 1))
                tt = Ys == c
                e[np.where(tt == True)] = 1 / len(Ys[np.where(Ys == c)])
                yy = Cls == c
                ind = np.where(yy == True)
                inds = [item + n for item in ind]
                e[tuple(inds)] = -1 / len(Cls[np.where(Cls == c)])
                e[np.isinf(e)] = 0
                N += np.dot(e, e.T)
            M = (1 - mu) * M + mu * N
            M /= np.linalg.norm(M, 'fro')
            left = np.dot(E + self.lamb * M + self.rho * L, K) + self.eta * np.eye(n + m, n + m)
            Beta = np.dot(np.linalg.inv(left), np.dot(E, YY))
            F = np.dot(K, Beta)
            Cls = np.argmax(F, axis=1) + 1
            Cls = Cls[n:]
            acc = np.mean(Cls == Yt.ravel())
            list_acc.append(acc)
            print('MEDA iteration [{}/{}]: mu={:.2f}, Acc={:.4f}'.format(t, self.T, mu, acc))
        return acc, Cls, list_acc 
Example 80
Project: euclid   Author: njpayne   File: descriptive_stats.py    GNU General Public License v2.0 4 votes vote down vote up
def draw_histograms(data, headings, data_set):
    """
        Chart relationships between Variables
    """
    chart_categories = ['course_grade', 'Assig_1_full_40', 'Assig_2_full_40', 'Assig_3_full_40', 'proj_1_100', 'proj_2_100', 'proj_3_100', 'final_exam_100', 'peer_feedback_100', 'birth_country', 'residence_country', 'gender', 'age', 'primary_language', 'english_fluency', 'time_zone', 'occupation', 'highest_education', 'expected_hours_spent', 'formal_class_prog_taken', 'C', 'C#', 'C++', 'Java', 'JavaScript', 'Lisp', 'Objective C', 'Perl', 'PHP', 'Python', 'Ruby', 'Shell', 'Swift', 'Visual Basic', 'Other (specify below)', 'years_programming', 'prior_omscs_classes_completed', 'besides_KBAI_how_many_classes', 'moocs_completed_outside_OMSCS', 'qtr_proj1_confidence', 'qtr_proj2_confidence', 'qtr_piazza_opinion', 'qtr_peerfeedback_opinion', 'qtr_on_piazza', 'qtr_email', 'qtr_hipchat', 'qrt_gplus', 'qtr_other_chat', 'qtr_phone', 'qtr_facebook', 'qtr_in_person', 'CS6210_Completed', 'CS8803_Completed', 'CS6250_Completed', 'CS7641_Completed', 'CS6300_Completed', 'CS6310_Completed', 'CS4495_Completed', 'CS6475_Completed', 'CS6505_Completed', 'CS6290_Completed', 'CS8803_Completed', 'CS6440_Completed', 'mid_proj2_confidence', 'mid_proj3_confidence', 'mid_piazza_opinion', 'mid_peerfeedback_opinion', 'mid_on_piazza', 'mid_email', 'mid_hipchat', 'qrt_gplus', 'mid_other_chat', 'mid_phone', 'mid_facebook', 'mid_in_person', 'final_proj3_confidence', 'hours_spent', 'lessons_watched', 'exercises_completed', 'forum_visit_frequency', 'final_on_piazza', 'final_email', 'final_hipchat', 'qrt_gplus', 'final_other_chat', 'final_phone', 'final_facebook', 'final_in_person', 'watch_out_order', 'fall_behind', 'get_ahead', 'rewatch_full_lesson', 'rewatch_partial_lesson', 'view_answer_after_1incorrect', 'repeat_exercise_until_correct', 'skip_exercise', 'correct_first_attempt', 'access_from_mobile', 'download_videos', 'piazza_answers', 'piazza_days', 'piazza_asks', 'piazza_posts', 'piazza_views', 'total_lecture_time', 'overal_lecture_views', 'lecture_1_views', 'lecture_2_views', 'lecture_3_views', 'lecture_4_views', 'lecture_5_views', 'lecture_6_views', 'lecture_7_views', 'lecture_8_views', 'lecture_9_views', 'lecture_10_views', 'lecture_11_views', 'lecture_12_views', 'lecture_13_views', 'lecture_14_views', 'lecture_15_views', 'lecture_16_views', 'lecture_17_views', 'lecture_18_views', 'lecture_19_views', 'lecture_20_views', 'lecture_21_views', 'lecture_22_views', 'lecture_23_views', 'lecture_24_views', 'lecture_25_views', 'lecture_26_views', 'lecture_1_pace', 'lecture_2_pace', 'lecture_3_pace', 'lecture_4_pace', 'lecture_5_pace', 'lecture_6_pace', 'lecture_7_pace', 'lecture_8_pace', 'lecture_9_pace', 'lecture_10_pace', 'lecture_11_pace', 'lecture_12_pace', 'lecture_13_pace', 'lecture_14_pace', 'lecture_15_pace', 'lecture_16_pace', 'lecture_17_pace', 'lecture_18_pace', 'lecture_19_pace', 'lecture_20_pace', 'lecture_21_pace', 'lecture_22_pace', 'lecture_23_pace', 'lecture_24_pace', 'lecture_25_pace', 'lecture_26_pace', 'overall_pace']
    #chart_categories = ["Age"]

    #create a folder for the dataset
    directory = os.path.dirname(os.path.join(os.getcwd(),"Results","Data Counts",data_set, ""))
    if not os.path.exists(directory):
        os.makedirs(directory)

    #convert to a pandas dataset
    pandas_data=pd.DataFrame(data = data, columns = headings)

    for chart_category in chart_categories:

        #get the slice
        index = np.argwhere(headings == chart_category)
        chart_column = data[ : , index[0][0]]


        #get counts

        plt.figure()
        plt.xlabel(chart_category)
        plt.ylabel("Count")
        plt.title("%s Count" % chart_category)

        try:
            #try converting to numbers
            chart_column = chart_column.astype(np.float)

            #create histogram
            hist, bin_edge = np.histogram(chart_column, 10)

            bin_middles = bin_edge[:-1] + np.diff(bin_edge)/2
        
            plt.hist(chart_column, 10, normed=False, histtype='bar', rwidth=0.8)

            pylab.savefig(os.path.join(os.getcwd(),"Results", "Data Counts",data_set, chart_category))

            plt.close()

        except:
            #get unique values
            unique_categories, unique_counts = np.unique(chart_column, return_counts=True)

            sns_plot = sns.countplot(x=chart_category, data=pandas_data, palette="Greens_d");
            #plt.setp(sns_plot.get_xticklabels(), rotation=45)
            sns_plot.figure.savefig(os.path.join(os.getcwd(),"Results", "Data Counts",data_set, chart_category))

            plt.close()
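The fallback branch for non-numeric columns above uses return_counts to get each category together with its frequency in a single call. A sketch on a made-up column:

import numpy as np

chart_column = np.array(["yes", "no", "yes", "yes"])   # hypothetical categorical column
unique_categories, unique_counts = np.unique(chart_column, return_counts=True)
print(unique_categories)   # ['no' 'yes']
print(unique_counts)       # [1 3]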