Python pandas.unique() Examples

The following code examples show how to use pandas.unique(). They are drawn from open source Python projects.

Example 1
Project: gullikson-scripts   Author: kgullikson88   File: HDF5_Helpers.py    MIT License
def list_stars(self, print2screen=False):
        """
        List all of the stars in all of the CCF interfaces

        Parameters:
        ===========
        - print2screen:     bool
                            Should we print the stars and dates to screen?

        Returns:
        =========
        - star_list:        list
                            A list of every star in the file, with duplicates removed, in order of first appearance.
        """
        stars = []
        for inst in self._interfaces.keys():
            if print2screen:
                print('Stars observed with {}: \n============================\n\n'.format(inst))
            stars.extend(self._interfaces[inst].list_stars(print2screen=print2screen))

        return list(pd.unique(stars)) 
Example 2
Project: wrangle   Author: autonomio   File: col_check_allsame.py    MIT License
def col_check_allsame(data, col):

    '''Checks if all values in a column
    are identical. A constant column can be
    detrimental to a deep learning model.

    data : DataFrame
    col : str

    '''

    uniques = len(pd.unique(data[col]))

    return uniques == 1 
Example 3
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype="m8[ns]")

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example 4
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py    GNU General Public License v3.0
def maskCategary(data_len,cat_data=None, mask=None):
    if mask is None:
        mask = (np.zeros((data_len,))==0)
    if cat_data is not None:
        cat_data[pd.isnull(cat_data)] = np.nan
        if cat_data.ndim==1:
            cat_data = cat_data.reshape((cat_data.shape[0],1))
        AllCats = [list(pd.unique(cat_data[mask,i])) for i in range(cat_data.shape[1])]
        AllCats = CartesianProduct(AllCats)
    else:
        AllCats = [(np.nan,)]
        cat_data = np.empty((data_len,1),dtype='float')+np.nan
    CatMask = {}
    for i,iCat in enumerate(AllCats):
        iMask = mask
        for j,jSubCat in enumerate(iCat):
            if pd.notnull(jSubCat):
                iMask = (iMask & (cat_data[:,j]==jSubCat))
            else:
                iMask = (iMask & pd.isnull(cat_data[:,j]))
        CatMask[tuple(iCat)] = iMask
    return CatMask
# Prepare the data for regression 
Example 5
Project: QuantStudio   Author: Scorpi000   File: AuxiliaryFun.py    GNU General Public License v3.0
def changeMultiClass2SingleClass(multi_class, sep=None):
    MultiClass = []
    for i in range(multi_class.shape[1]):
        MultiClass.append(pd.unique(multi_class[:,i]).tolist())
    MultiClass = CartesianProduct(MultiClass)
    SingleClassData = np.empty(shape=(multi_class.shape[0],),dtype="O")
    ClassDict = {}
    for i,iMultiClass in enumerate(MultiClass):
        iMask = np.array([True]*multi_class.shape[0])
        if sep is not None:
            iSingleClass = sep.join(map(str,iMultiClass))
        else:
            iSingleClass = str(i)
        for j,jSubClass in enumerate(iMultiClass):
            if pd.notnull(jSubClass):
                iMask = iMask & (multi_class[:,j]==jSubClass)
            else:
                iMask = iMask & pd.isnull(multi_class[:,j])
        SingleClassData[iMask] = iSingleClass
        ClassDict[iSingleClass] = iMultiClass
    return (SingleClassData,ClassDict)
# Given a subclass, return the mask of the class_data rows that belong to that class; if subclass is None, return an all-True mask
# subclass: a list of class names, e.g.: ['银行', '大盘']
# class_data: the class data, a DataFrame(columns=[class names]) or an array 
Example 6
Project: QuantStudio   Author: Scorpi000   File: WindDB.py    GNU General Public License v3.0
def __QS_readData__(self, factor_names=None, ids=None, dts=None, args={}):
        if dts: StartDate, EndDate = dts[0].date(), dts[-1].date()
        else: StartDate, EndDate = None, None
        if factor_names is None: factor_names = self.FactorNames
        RawData = self._getRawData(factor_names, ids, start_date=StartDate, end_date=EndDate, args=args)
        if StartDate is None:
            StartDate = dt.datetime.strptime(np.min(RawData["纳入日期"].values), "%Y%m%d").date()
            DateSeries = getDateSeries(StartDate, dt.date.today())
        else:
            DateSeries = getDateSeries(dts[0].date(), dts[-1].date())
        Data = {}
        for iIndexID in factor_names:
            iRawData = RawData[RawData["指数ID"]==iIndexID].set_index(["ID"])
            iData = pd.DataFrame(0, index=DateSeries, columns=pd.unique(iRawData.index))
            for jID in iData.columns:
                jIDRawData = iRawData.loc[[jID]]
                for k in range(jIDRawData.shape[0]):
                    kStartDate = dt.datetime.strptime(jIDRawData["纳入日期"].iloc[k], "%Y%m%d").date()
                    kEndDate = (dt.datetime.strptime(jIDRawData["剔除日期"].iloc[k], "%Y%m%d").date()-dt.timedelta(1) if jIDRawData["剔除日期"].iloc[k] is not None else dt.date.today())
                    iData[jID].loc[kStartDate:kEndDate] = 1
            Data[iIndexID] = iData
        Data = pd.Panel(Data).loc[factor_names]
        Data.major_axis = [dt.datetime.combine(iDate, dt.time(0)) for iDate in Data.major_axis]
        Data.fillna(value=0, inplace=True)
        return adjustDateTime(Data, dts, fillna=True, method="bfill") 
Example 7
Project: QuantStudio   Author: Scorpi000   File: WindDB2.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        StartDate, EndDate = dts[0].date(), dts[-1].date()
        DateSeries = getDateSeries(StartDate, EndDate)
        Data = {}
        for iIndexID in factor_names:
            iRawData = raw_data[raw_data[self._GroupField]==iIndexID].set_index([self._IDField])
            iData = pd.DataFrame(0, index=DateSeries, columns=pd.unique(iRawData.index))
            for jID in iData.columns:
                jIDRawData = iRawData.loc[[jID]]
                for k in range(jIDRawData.shape[0]):
                    kStartDate = dt.datetime.strptime(jIDRawData[self._InDateField].iloc[k], "%Y%m%d").date()
                    kEndDate = (dt.datetime.strptime(jIDRawData[self._OutDateField].iloc[k], "%Y%m%d").date()-dt.timedelta(1) if jIDRawData[self._OutDateField].iloc[k] is not None else dt.date.today())
                    iData[jID].loc[kStartDate:kEndDate] = 1
            Data[iIndexID] = iData
        Data = pd.Panel(Data)
        if Data.minor_axis.intersection(ids).shape[0]==0: return pd.Panel(0.0, items=factor_names, major_axis=dts, minor_axis=ids)
        Data = Data.loc[factor_names, :, ids]
        Data.major_axis = [dt.datetime.combine(iDate, dt.time(0)) for iDate in Data.major_axis]
        Data.fillna(value=0, inplace=True)
        return adjustDateTime(Data, dts, fillna=True, method="bfill") 
Example 8
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0
def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example 9
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0
def test_first_nan_kept(self):
        # GH 22295
        # create different nans from bit-patterns:
        bits_for_nan1 = 0xfff8000000000001
        bits_for_nan2 = 0x7ff8000000000001
        NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        for el_type in [np.float64, np.object]:
            a = np.array([NAN1, NAN2], dtype=el_type)
            result = pd.unique(a)
            assert result.size == 1
            # use bit patterns to identify which nan was kept:
            result_nan_bits = struct.unpack("=Q",
                                            struct.pack("d", result[0]))[0]
            assert result_nan_bits == bits_for_nan1 
Example 10
Project: gullikson-scripts   Author: kgullikson88   File: Sensitivity.py    MIT License
def read_hdf5(hdf5_file):
    """
    Reads the hdf5 file into a dataframe. Assumes a very specific format!

    Parameters:
    ===========
    - hdf5_file:   string
                   The full path to the hdf5 file.

    Returns
    ========
    A pandas DataFrame containing summary information
    """
    logging.info('Reading HDF5 file {}'.format(hdf5_file))
    hdf5_int = HDF5_Interface(hdf5_file)
    df = hdf5_int.to_df()


    # Get the contrast. Split by group and then merge to limit the amount of calculation needed
    logging.info('Estimating the V-band contrast ratio for each trial')
    test_vsini = df.vsini.unique()[0]
    temp = df.loc[(df.rv == 0) & (df.vsini == test_vsini)].drop_duplicates(subset=['star', 'temperature'])
    temp['contrast'] = temp.apply(lambda r: get_contrast(r, band='V'), axis=1)

    logging.info('Estimating the luminosity ratio for each trial')
    temp['lum_ratio'] = temp.apply(get_luminosity_ratio, axis=1)

    logging.info('Re-merging dataframe')
    df = pd.merge(df, temp[['star', 'temperature', 'contrast', 'lum_ratio']], on=['star', 'temperature'], how='left')
    df['logL'] = np.log10(df.lum_ratio)

    return df 
Example 11
Project: gullikson-scripts   Author: kgullikson88   File: Sensitivity.py    MIT License
def parse_input(inp, sort_output=True, ensure_unique=True):
    """
    Parse the user input to get a list of integers.

    Parameters:
    ===========
    - inp:           string
                     Can be in the form 'a-b', 'a,b,c', 'a-b,c-d', etc.
                     '-' means an inclusive list of every number between a and b
                     ',' means the numbers a and b

    - sort_output:   boolean
                     Sort the output integers?

    - ensure_unique: boolean
                     Make sure the final list has no repeats?
    Returns:
    ========
    - A list of integers
    """
    sublists = inp.split(',')
    final_list = []
    for l in sublists:
        if '-' in l:
            first, last = l.split('-')
            for i in range(int(first), int(last) + 1):
                final_list.append(i)
        else:
            final_list.append(int(l))

    if ensure_unique:
        final_list = pd.unique(final_list)
    if sort_output:
        final_list = sorted(final_list)
    return final_list 
Example 12
Project: gullikson-scripts   Author: kgullikson88   File: Analyze_CCF.py    MIT License
def get_ccf(self, params, df=None):
        """
        Get the ccf with the given parameters.

        Parameters:
        ===========
        - params:    dictionary:
                     All the parameters necessary to define a single ccf. This should be
                     a python dictionary with the keys:
                         - 'starname': The name of the star. Try self.list_stars() for the options.
                         - 'date': The UT date of the observations. Try self.list_dates() for the options.
                         - 'T': temperature of the model
                         - 'logg': the log(g) of the model
                         - 'vsini': the vsini by which the model was broadened before correlation
                         - '[Fe/H]': the metallicity of the model
                         - 'addmode': The way the order CCFs were added to make a total one. Can be:
                             - 'simple'
                             - 'ml'
                             - 'weighted'
                             - 'dc'


        - df:        a pandas DataFrame, such as the one output by _compile_data

        Returns:
        ========
        - ccf:       pandas DataFrame
                     Holds columns of velocity and CCF power
        """
        if df is None:
            try:
                df = self._compile_data(params['starname'], params['date'])
            except KeyError:
                raise KeyError('Must give get_ccf params with starname and date keywords, if df is not given!')

        Tvals = df['T'].unique()
        T = Tvals[np.argmin(abs(Tvals - params['T']))]
        good = df.loc[(df['T'] == T) & (df.logg == params['logg']) & (df.vsini == params['vsini']) \
                      & (df['[Fe/H]'] == params['[Fe/H]']) & (df.addmode == params['addmode'])]

        return pd.DataFrame(data={'velocity': self.velocities, 'CCF': good['ccf'].item()}) 
Example 13
Project: gullikson-scripts   Author: kgullikson88   File: CCF_Systematics.py    MIT License
def add_actual_temperature(df, method='excel', filename='SecondaryStar_Temperatures.xls'):
    """
    Add the actual temperature to a given summary dataframe
    :param df: The dataframe to which we will add the actual secondary star temperature
    :keyword method: How to get the actual temperature. Options are:
                   - 'spt': Use main-sequence relationships to go from spectral type --> temperature
                   - 'excel': Use tabulated data, available in the file 'SecondaryStar_Temperatures.xls'
    :keyword filename: The filename of the excel spreadsheet containing the literature temperatures.
                       Needs to have the right format! Ignored if method='spt'
    :return: the dataframe, with extra columns for the secondary star temperature and its uncertainty
    """
    # First, get a list of the secondary stars in the data
    secondary_names = pd.unique(df.Secondary)
    secondary_to_temperature = defaultdict(float)
    secondary_to_error = defaultdict(float)

    if method.lower() == 'spt':
        MS = SpectralTypeRelations.MainSequence()
        for secondary in secondary_names:
            star_data = StarData.GetData(secondary)
            spt = star_data.spectype[0] + re.search('[0-9]\.*[0-9]*', star_data.spectype).group()
            T_sec = MS.Interpolate(MS.Temperature, spt)
            secondary_to_temperature[secondary] = T_sec

    elif method.lower() == 'excel':
        table = pd.read_excel(filename, 0)
        for secondary in secondary_names:
            T_sec = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())]['Literature_Temp'].item()
            T_error = table.loc[table.Star.str.lower().str.contains(secondary.strip().lower())][
                'Literature_error'].item()
            secondary_to_temperature[secondary] = T_sec
            secondary_to_error[secondary] = T_error

    df['Tactual'] = df['Secondary'].map(lambda s: secondary_to_temperature[s])
    df['Tact_err'] = df['Secondary'].map(lambda s: secondary_to_error[s])
    return df 
Example 14
Project: gullikson-scripts   Author: kgullikson88   File: CCF_Systematics.py    MIT License
def fit_sigma(df, i):
    """
    Find the largest allowable standard deviation, given the possible values Tactual can take.
    """
    Tmeasured, Tactual, _, _ = get_values(df)
    Tm = Tmeasured[i]
    
    # Get the possible values, and bin those with this measured value
    possible_values = sorted(pd.unique(df.Tactual))
    edges = [(possible_values[i] + possible_values[i+1])/2 for i in range(len(possible_values)-1)]
    bins = [0] + edges + [9e9]
    good = df.loc[df.Temperature == Tm]
    values, _ = np.histogram(good.Tactual.values, bins=bins)
    
    mean = np.mean(good.Tactual.values)
    std = np.std(good.Tactual.values, ddof=1)
    if std > 0:
        return std
    
    sigma_test = np.arange(500, 10, -10)  # just test a bunch of values
    idx = np.argmin(abs(np.array(bins) - mean))
    x1 = bins[idx-2] if idx > 2 else -1
    x2 = bins[idx-1]
    x3 = bins[idx]
    x4 = bins[idx+1] if idx < len(bins)-2 else np.inf
    N = len(good)
    probs = [get_probability(x1, x2, x3, x4, N, mean, s) for s in sigma_test]
    for s, p in zip(sigma_test, probs):
        if p > 0.5:
            return s
    
    # If we get here, just return a guess value
    return 200.0 
Example 15
Project: ml_stuff   Author: chrisranderson   File: data_prep.py    MIT License
def nominal_to_numeric(array):
  mapper = {name: i for i, name in enumerate(pd.unique(array))}
  return np.array([mapper[name] for name in array]) 
Example 16
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_numpy.py    MIT License
def data_for_grouping(allow_in_pandas, dtype):
    """Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]

    Where A < B < C and NA is missing
    """
    if dtype.numpy_dtype == "object":
        a, b, c = (1,), (2,), (3,)
    else:
        a, b, c = np.arange(3)
    return PandasArray(np.array([b, b, np.nan, np.nan, a, a, b, c])) 
Example 17
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_cut.py    MIT License
def test_cut_duplicates_bin(kwargs, msg):
    # see gh-20947
    bins = [0, 2, 4, 6, 10, 10]
    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            cut(values, bins, **kwargs)
    else:
        result = cut(values, bins, **kwargs)
        expected = cut(values, pd.unique(bins))
        tm.assert_series_equal(result, expected) 
Example 18
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray) 
Example 19
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype("O")

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray) 
Example 20
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_object_refcount_bug(self):
        lst = ["A", "B", "C", "D", "E"]
        for i in range(1000):
            len(algos.unique(lst)) 
Example 21
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np_array_datetime64_compat(
            [
                "2015-01-03T00:00:00.000000000+0000",
                "2015-01-01T00:00:00.000000000+0000",
            ],
            dtype="M8[ns]",
        )

        dt_index = pd.to_datetime(
            [
                "2015-01-03T00:00:00.000000000",
                "2015-01-01T00:00:00.000000000",
                "2015-01-01T00:00:00.000000000",
            ]
        )
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype 
Example 22
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_uint64_overflow(self):
        s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64)
        exp = np.array([1, 2, 2 ** 63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.unique(s), exp) 
Example 23
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_nan_in_object_array(self):
        duplicated_items = ["a", np.nan, "c", "c"]
        result = pd.unique(duplicated_items)
        expected = np.array(["a", np.nan, "c"], dtype=object)
        tm.assert_numpy_array_equal(result, expected) 
Example 24
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_categorical(self):

        # we are expecting to return in the order
        # of appearance
        expected = Categorical(list("bac"), categories=list("bac"))

        # we are expecting to return in the order
        # of the categories
        expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)

        # GH 15939
        c = Categorical(list("baabc"))
        result = c.unique()
        tm.assert_categorical_equal(result, expected)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected)

        c = Categorical(list("baabc"), ordered=True)
        result = c.unique()
        tm.assert_categorical_equal(result, expected_o)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected_o)

        # Series of categorical dtype
        s = Series(Categorical(list("baabc")), name="foo")
        result = s.unique()
        tm.assert_categorical_equal(result, expected)

        result = pd.unique(s)
        tm.assert_categorical_equal(result, expected)

        # CI -> return CI
        ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac")))
        expected = CategoricalIndex(expected)
        result = ci.unique()
        tm.assert_index_equal(result, expected)

        result = pd.unique(ci)
        tm.assert_index_equal(result, expected) 
Example 25
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_order_of_appearance(self):
        # 9346
        # light testing of guarantee of order of appearance
        # these also are the doc-examples
        result = pd.unique(Series([2, 1, 3, 3]))
        tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))

        result = pd.unique(Series([2] + [1] * 5))
        tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))

        result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")]))
        expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]")
        tm.assert_numpy_array_equal(result, expected)

        result = pd.unique(
            Index(
                [
                    Timestamp("20160101", tz="US/Eastern"),
                    Timestamp("20160101", tz="US/Eastern"),
                ]
            )
        )
        expected = DatetimeIndex(
            ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
        )
        tm.assert_index_equal(result, expected)

        result = pd.unique(list("aabc"))
        expected = np.array(["a", "b", "c"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

        result = pd.unique(Series(Categorical(list("aabc"))))
        expected = Categorical(list("abc"))
        tm.assert_categorical_equal(result, expected) 
Example 26
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_tuple_with_strings(self, arg, expected):
        # see GH 17108
        result = pd.unique(arg)
        tm.assert_numpy_array_equal(result, expected) 
Example 27
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_obj_none_preservation(self):
        # GH 20866
        arr = np.array(["foo", None], dtype=object)
        result = pd.unique(arr)
        expected = np.array(["foo", None], dtype=object)

        tm.assert_numpy_array_equal(result, expected, strict_nan=True) 
Example 28
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_signed_zero(self):
        # GH 21866
        a = np.array([-0.0, 0.0])
        result = pd.unique(a)
        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
        tm.assert_numpy_array_equal(result, expected) 
Example 29
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_different_nans(self):
        # GH 21866
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
        result = pd.unique(a)
        expected = np.array([np.nan])
        tm.assert_numpy_array_equal(result, expected) 
Example 30
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
        # GH 22295
        if unique_nulls_fixture is unique_nulls_fixture2:
            return  # skip it, values not unique
        a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object)
        result = pd.unique(a)
        assert result.size == 2
        assert a[0] is unique_nulls_fixture
        assert a[1] is unique_nulls_fixture2 
Example 31
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_unique_tuples(self, arr, unique):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(unique), dtype=object)
        expected[:] = unique

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected) 
Example 32
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_get_unique(self):
        s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64)
        exp = np.array([1, 2, 2 ** 63], dtype=np.uint64)
        tm.assert_numpy_array_equal(s.unique(), exp) 
Example 33
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_hashtable_unique(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, "make" + tm_dtype + "Index")
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_unique = s_duplicated.drop_duplicates(keep="first").values
        result_unique = htable().unique(s_duplicated.values)
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # test return_inverse=True
        # reconstruction can only succeed if the inverse is correct
        result_unique, result_inverse = htable().unique(
            s_duplicated.values, return_inverse=True
        )
        tm.assert_numpy_array_equal(result_unique, expected_unique)
        reconstr = result_unique[result_inverse]
        tm.assert_numpy_array_equal(reconstr, s_duplicated.values) 
Example 34
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_algos.py    MIT License
def test_hashtable_factorize(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, "make" + tm_dtype + "Index")
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)
        na_mask = s_duplicated.isna().values

        result_unique, result_inverse = htable().factorize(s_duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_unique = s_duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        result_reconstruct = result_unique[result_inverse[~na_mask]]
        expected_reconstruct = s_duplicated.dropna().values
        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) 
Example 35
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: numpy_.py    MIT License
def unique(self):
        from pandas import unique

        return type(self)(unique(self._ndarray))

    # ------------------------------------------------------------------------
    # Reductions 
Example 36
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: base.py    MIT License
def unique(self):
        """
        Compute the ExtensionArray of unique values.

        Returns
        -------
        uniques : ExtensionArray
        """
        from pandas import unique

        uniques = unique(self.astype(object))
        return self._from_sequence(uniques, dtype=self.dtype) 
Example 37
Project: variational-autoencoders-for-collaborative-filtering-pytorch   Author: cydonia999   File: create_lastfm.py    MIT License
def load_tr_te_data(csv_file_tr, csv_file_te):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())
    assert pd.unique(tp_tr["uid"]).shape[0] == end_idx - start_idx + 1
    assert pd.unique(tp_te["uid"]).shape[0] == end_idx - start_idx + 1

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr), (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te), (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te 
Example 38
Project: variational-autoencoders-for-collaborative-filtering-pytorch   Author: cydonia999   File: create_ml.py    MIT License
def load_tr_te_data(csv_file_tr, csv_file_te):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())
    assert pd.unique(tp_tr["uid"]).shape[0] == end_idx - start_idx + 1
    assert pd.unique(tp_te["uid"]).shape[0] == end_idx - start_idx + 1

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr), (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te), (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te 
Example 39
Project: QuantStudio   Author: Scorpi000   File: BrinsonModel.py    GNU General Public License v3.0
def __QS_move__(self, idt, **kwargs):
        if self._iDT==idt: return 0
        self._iDT = idt
        PreDT = None
        if self.CalcDTs:
            if idt not in self.CalcDTs[self._CurCalcInd:]: return 0
            self._CurCalcInd = self.CalcDTs[self._CurCalcInd:].index(idt) + self._CurCalcInd
            if self._CurCalcInd>0: PreDT = self.CalcDTs[self._CurCalcInd - 1]
        else:
            self._CurCalcInd = self._Model.DateTimeIndex
            if self._CurCalcInd>0: PreDT = self._Model.DateTimeSeries[self._CurCalcInd - 1]
        if PreDT is None: return 0
        Portfolio = self._FactorTable.readData(factor_names=[self.Portfolio, self.BenchmarkPortfolio], dts=[PreDT], ids=self._IDs).iloc[:, 0, :]
        BenchmarkPortfolio, Portfolio = Portfolio.iloc[:, 1], Portfolio.iloc[:, 0]
        Portfolio[pd.isnull(Portfolio)], BenchmarkPortfolio[pd.isnull(BenchmarkPortfolio)] = 0.0, 0.0
        Price = self._FactorTable.readData(factor_names=[self.PriceFactor], dts=[PreDT, idt], ids=self._IDs).iloc[0]
        Return = Price.iloc[1] / Price.iloc[0] - 1
        Return[pd.isnull(Return)] = 0.0
        GroupData = self._FactorTable.readData(factor_names=[self.GroupFactor], ids=self._IDs, dts=[PreDT]).iloc[0, 0, :]
        AllGroups = pd.unique(GroupData[pd.notnull(GroupData)].values).tolist()
        if GroupData.hasnans: AllGroups.append(None)
        for iGroup in AllGroups:
            if iGroup is None: iMask = pd.isnull(GroupData)
            else: iMask = (GroupData==iGroup)
            iGroup = str(iGroup)
            iPortfolio, iBenchmarkPortfolio = Portfolio[iMask], BenchmarkPortfolio[iMask]
            iGroupWeight, iBenchmarkGroupWeight = iPortfolio.sum(), iBenchmarkPortfolio.sum()
            self._Output["策略组合资产权重"].loc[idt, iGroup] = iGroupWeight
            self._Output["基准组合资产权重"].loc[idt, iGroup] = iBenchmarkGroupWeight
            self._Output["策略组合资产收益"].loc[idt, iGroup] = ((iPortfolio * Return[iMask]).sum() / iGroupWeight if iGroupWeight!=0 else 0.0)
            self._Output["基准组合资产收益"].loc[idt, iGroup] = ((iBenchmarkPortfolio * Return[iMask]).sum() / iBenchmarkGroupWeight if iBenchmarkGroupWeight!=0 else 0.0)
        self._Output["策略组合资产权重"].loc[idt, "现金"] = 1 - self._Output["策略组合资产权重"].loc[idt].iloc[1:].sum()
        self._Output["基准组合资产权重"].loc[idt, "现金"] = 1 - self._Output["基准组合资产权重"].loc[idt].iloc[1:].sum()
        return 0 
Example 40
Project: QuantStudio   Author: Scorpi000   File: Distribution.py    GNU General Public License v3.0
def __QS_start__(self, mdl, dts, **kwargs):
        if self._isStarted: return ()
        super().__QS_start__(mdl=mdl, dts=dts, **kwargs)
        AllIndustries = pd.unique(self._FactorTable.readData(factor_names=[self.IndustryFactor], dts=self._FactorTable.getDateTime(ifactor_name=self.IndustryFactor), ids=self._FactorTable.getID(ifactor_name=self.IndustryFactor)).iloc[0].values.flatten())
        Mask = pd.isnull(AllIndustries)
        if np.sum(Mask)>0: AllIndustries = AllIndustries[~Mask].tolist()+[None]
        self._Output = {iFactorName:{iIndustry:[] for iIndustry in AllIndustries} for iFactorName in self.TestFactors}
        self._Output["历史平均值"] = {iFactorName:[] for iFactorName in self.TestFactors}
        self._Output["历史标准差"] = {iFactorName:[] for iFactorName in self.TestFactors}
        self._Output["行业分类"] = AllIndustries
        self._Output["时点"] = []
        self._CurCalcInd = 0
        return (self._FactorTable, ) 
Example 41
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py    GNU General Public License v3.0
def standardizeRank(data, mask=None, cat_data=None, ascending=True, uniformization=True, perturbation=False, offset=0.5, other_handle='填充None'):
    """Rank 标准化"""
    if other_handle=="保持不变":
        StdData = np.copy(data)
    else:
        StdData = np.empty(data.shape,dtype='float')+np.nan
    if mask is None:
        mask = pd.isnull(StdData)
    if perturbation:
        UniqueData = data[pd.notnull(data)]
        if UniqueData.shape[0]>0:
            UniqueData = np.sort(pd.unique(UniqueData))
            MinDiff = np.min(np.abs(np.diff(UniqueData)))
            data = data+np.random.rand(data.shape[0])*MinDiff*0.01
    CatMasks = maskCategary(data.shape[0],cat_data=cat_data,mask=mask)
    for jCat,jCatMask in CatMasks.items():
        jData = data[jCatMask]
        jNotNaMask = pd.notnull(jData)
        if ascending:
            jRank = np.argsort(np.argsort(jData[jNotNaMask]))
        else:
            jRank = np.argsort(np.argsort(-jData[jNotNaMask]))
        if uniformization:
            jRank = (jRank.astype('float')+offset)/jRank.shape[0]
        else:
            jRank = jRank.astype('float')
        jData[jNotNaMask] = jRank
        StdData[jCatMask] = jData
    return StdData

# Quantile-transformation standardization
# data: the data to standardize, array; cat_data: category data, array
# ascending: whether to rank in ascending order; options: True, False 
Example 42
Project: QuantStudio   Author: Scorpi000   File: JYDB.py    GNU General Public License v3.0
def _calcMultiMappingData(self, raw_data, factor_names, ids, dts, args={}):
        Data, nDT, nFactor = {}, len(dts), len(factor_names)
        raw_data.set_index(["ID"], inplace=True)
        raw_data["QS_结束日"] = raw_data["QS_结束日"].where(pd.notnull(raw_data["QS_结束日"]), dts[-1]+dt.timedelta(1))
        if args.get("只填起始日", self.OnlyStartFilled):
            raw_data["QS_起始日"] = raw_data["QS_起始日"].where(raw_data["QS_起始日"]>=dts[0], dts[0])
            for iID in raw_data.index.unique():
                iRawData = raw_data.loc[[iID]].set_index(["QS_起始日"])
                iData = pd.DataFrame([([],)*nFactor]*nDT, index=dts, columns=factor_names, dtype="O")
                for jStartDate in iRawData.index.drop_duplicates():
                    iData.iloc[iData.index.searchsorted(jStartDate)] += pd.Series(iRawData.loc[[jStartDate], factor_names].values.T.tolist(), index=factor_names)
                Data[iID] = iData
            return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids]
        else:
            DeltaDT = dt.timedelta(int(not self._EndDateIncluded))
            for iID in raw_data.index.unique():
                iRawData = raw_data.loc[[iID]].set_index(["QS_起始日", "QS_结束日"])
                iData = pd.DataFrame([([],)*nFactor]*nDT, index=dts, columns=factor_names, dtype="O")
                for jStartDate, jEndDate in iRawData.index.drop_duplicates():
                    ijRawData = iRawData.loc[jStartDate].loc[[jEndDate], factor_names].values.T.tolist()
                    if pd.isnull(jEndDate) or (jEndDate<jStartDate):
                        ijOldData = iData.loc[jStartDate:]
                        iData.loc[jStartDate:] += pd.DataFrame([ijRawData] * ijOldData.shape[0], index=ijOldData.index, columns=ijOldData.columns, dtype="O")
                    else:
                        jEndDate -= DeltaDT
                        ijOldData = iData.loc[jStartDate:jEndDate]
                        iData.loc[jStartDate:jEndDate] += pd.DataFrame([ijRawData] * ijOldData.shape[0], index=ijOldData.index, columns=ijOldData.columns, dtype="O")
                Data[iID] = iData
            return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids] 
Example 43
Project: QuantStudio   Author: Scorpi000   File: JYDB.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        if raw_data.shape[0]==0: return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)
        if args.get("多重映射", self.MultiMapping): return self._calcMultiMappingData(raw_data, factor_names, ids, dts, args=args)
        raw_data.set_index(["ID"], inplace=True)
        Data, nFactor = {}, len(factor_names)
        if args.get("只填起始日", self.OnlyStartFilled):
            raw_data["QS_起始日"] = raw_data["QS_起始日"].where(raw_data["QS_起始日"]>=dts[0], dts[0])
            for iID in raw_data.index.unique():
                iRawData = raw_data.loc[[iID]].set_index(["QS_起始日"])
                iData = pd.DataFrame(index=dts, columns=factor_names)
                for jStartDate in iRawData.index:
                    iData.iloc[iData.index.searchsorted(jStartDate)] = iRawData.loc[jStartDate, factor_names]
                Data[iID] = iData
            return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids]
        else:
            DeltaDT = dt.timedelta(int(not self._EndDateIncluded))
            for iID in raw_data.index.unique():
                iRawData = raw_data.loc[[iID]]
                iData = pd.DataFrame(index=dts, columns=factor_names)
                for j in range(iRawData.shape[0]):
                    ijRawData = iRawData.iloc[j]
                    jStartDate, jEndDate = ijRawData["QS_起始日"], ijRawData["QS_结束日"]
                    if pd.isnull(jEndDate) or (jEndDate<jStartDate):
                        iData.loc[jStartDate:] = np.repeat(ijRawData[factor_names].values.reshape((1, nFactor)), iData.loc[jStartDate:].shape[0], axis=0)
                    else:
                        jEndDate -= DeltaDT
                        iData.loc[jStartDate:jEndDate] = np.repeat(ijRawData[factor_names].values.reshape((1, nFactor)), iData.loc[jStartDate:jEndDate].shape[0], axis=0)
                Data[iID] = iData
            return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids] 
Example 44
Project: QuantStudio   Author: Scorpi000   File: JYDB.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        StartDate, EndDate = dts[0].date(), dts[-1].date()
        DateSeries = getDateSeries(StartDate, EndDate)
        Data = {}
        for iIndexID in factor_names:
            iRawData = raw_data[raw_data["IndexID"]==int(iIndexID)].set_index(["SecurityID"])
            iData = pd.DataFrame(0, index=DateSeries, columns=pd.unique(iRawData.index))
            for jID in iData.columns:
                jIDRawData = iRawData.loc[[jID]]
                for k in range(jIDRawData.shape[0]):
                    kStartDate = jIDRawData["InDate"].iloc[k].date()
                    kEndDate = (jIDRawData["OutDate"].iloc[k].date()-dt.timedelta(1) if jIDRawData["OutDate"].iloc[k] is not None else dt.date.today())
                    iData[jID].loc[kStartDate:kEndDate] = 1
            Data[iIndexID] = iData
        Data = pd.Panel(Data)
        if Data.minor_axis.intersection(ids).shape[0]==0: return pd.Panel(0.0, items=factor_names, major_axis=dts, minor_axis=ids)
        Data = Data.loc[factor_names, :, ids]
        Data.major_axis = [dt.datetime.combine(iDate, dt.time(0)) for iDate in Data.major_axis]
        Data.fillna(value=0, inplace=True)
        return adjustDateTime(Data, dts, fillna=True, method="bfill")


# Quote factor table; structural features:
# a date field, giving the time point that the data fills;
# condition fields, used to filter records; there may be more than one condition field
# under a given set of conditions, the fill time point plus the ID uniquely identifies a record
# fill in the data already present in the table first, then fill missing time points according to the look-back parameter 
Example 45
Project: QuantStudio   Author: Scorpi000   File: JYDB.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        if raw_data.shape[0]==0: return pd.Panel(np.nan, items=factor_names, major_axis=dts, minor_axis=ids)
        Dates = sorted({iDT.strftime("%Y%m%d") for iDT in dts})
        CalcType, LookBack = args.get("计算方法", self.CalcType), args.get("回溯天数", self.LookBack)
        if CalcType=="Fwd12M":
            CalcFun, FYNum, ANNReportData = self._calcIDData_Fwd12M, None, None
        else:
            CalcFun, FYNum = self._calcIDData_FY, int(CalcType[-1])
            ANNReportPath = raw_data.columns.name
            if (ANNReportPath is not None) and os.path.isfile(ANNReportPath+("."+self._ANN_ReportFileSuffix if self._ANN_ReportFileSuffix else "")):
                with shelve.open(ANNReportPath) as ANN_ReportFile:
                    ANNReportData = ANN_ReportFile["RawData"]
            else:
                ANNReportData = _prepareReportANNRawData(self._FactorDB, ids)
            ANNReportData = ANNReportData.set_index(["ID"])
        raw_data = raw_data.set_index(["ID"])
        Data = {}
        for iID in raw_data.index.unique():
            if ANNReportData is not None:
                if iID in ANNReportData.index:
                    iANNReportData = ANNReportData.loc[[iID]]
                else:
                    continue
            else:
                iANNReportData = None
            Data[iID] = CalcFun(Dates, raw_data.loc[[iID]], iANNReportData, factor_names, LookBack, FYNum)
        Data = pd.Panel(Data, minor_axis=factor_names)
        Data.major_axis = [dt.datetime.strptime(iDate, "%Y%m%d") for iDate in Dates]
        Data = Data.swapaxes(0, 2)
        return self._adjustDataDTID(Data, LookBack, factor_names, ids, dts) 
Example 46
Project: QuantStudio   Author: Scorpi000   File: WindDB2.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        if raw_data.shape[0]==0: return pd.Panel(items=factor_names, major_axis=dts, minor_axis=ids)
        raw_data[self._EndDateField] = raw_data[self._EndDateField].where(pd.notnull(raw_data[self._EndDateField]), dt.date.today().strftime("%Y%m%d"))
        raw_data.set_index(["ID"], inplace=True)
        DeltaDT = dt.timedelta(int(not self._EndDateIncluded))
        Data, nFactor = {}, len(factor_names)
        for iID in raw_data.index.unique():
            iRawData = raw_data.loc[[iID]]
            iData = pd.DataFrame(index=dts, columns=factor_names)
            for j in range(iRawData.shape[0]):
                ijRawData = iRawData.iloc[j]
                jStartDate, jEndDate = dt.datetime.strptime(ijRawData[self._StartDateField], "%Y%m%d"), dt.datetime.strptime(ijRawData[self._EndDateField], "%Y%m%d")-DeltaDT
                iData.loc[jStartDate:jEndDate] = np.repeat(ijRawData[factor_names].values.reshape((1, nFactor)), iData.loc[jStartDate:jEndDate].shape[0], axis=0)
            Data[iID] = iData
        return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids] 
Example 47
Project: QuantStudio   Author: Scorpi000   File: WindDB2.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        if raw_data.shape[0]==0: return pd.Panel(np.full(shape=(len(factor_names), len(dts), len(ids)), fill_value=None, dtype="O"), items=factor_names, major_axis=dts, minor_axis=ids)
        raw_data[self._OutDateField] = raw_data[self._OutDateField].where(pd.notnull(raw_data[self._OutDateField]), dt.date.today().strftime("%Y%m%d"))
        raw_data.set_index(["ID"], inplace=True)
        DeltaDT = dt.timedelta(int(not self._OutDateIncluded))
        Data, nFactor = {}, len(factor_names)
        for iID in raw_data.index.unique():
            iRawData = raw_data.loc[[iID]]
            iData = pd.DataFrame(index=dts, columns=factor_names, dtype="O")
            for j in range(iRawData.shape[0]):
                ijRawData = iRawData.iloc[j]
                jStartDate, jEndDate = dt.datetime.strptime(ijRawData[self._InDateField], "%Y%m%d"), dt.datetime.strptime(ijRawData[self._OutDateField], "%Y%m%d")-DeltaDT
                iData.loc[jStartDate:jEndDate] = np.repeat(ijRawData[factor_names].values.reshape((1, nFactor)), iData.loc[jStartDate:jEndDate].shape[0], axis=0)
            Data[iID] = iData
        return pd.Panel(Data).swapaxes(0, 2).loc[:, :, ids] 
Example 48
Project: QuantStudio   Author: Scorpi000   File: WindDB2.py    GNU General Public License v3.0
def __QS_calcData__(self, raw_data, factor_names, ids, dts, args={}):
        if raw_data.shape[0]==0: return pd.Panel(np.nan, items=factor_names, major_axis=dts, minor_axis=ids)
        Dates = sorted({iDT.strftime("%Y%m%d") for iDT in dts})
        CalcType, LookBack = args.get("计算方法", self.CalcType), args.get("回溯天数", self.LookBack)
        if CalcType=="Fwd12M":
            CalcFun, FYNum, ANNReportData = self._calcIDData_Fwd12M, None, None
        else:
            CalcFun, FYNum = self._calcIDData_FY, int(CalcType[-1])
            ANNReportPath = raw_data.columns.name
            if (ANNReportPath is not None) and os.path.isfile(ANNReportPath+("."+self._ANN_ReportFileSuffix if self._ANN_ReportFileSuffix else "")):
                with shelve.open(ANNReportPath) as ANN_ReportFile:
                    ANNReportData = ANN_ReportFile["RawData"]
            else:
                ANNReportData = _prepareReportANNRawData(self._FactorDB, ids)
            ANNReportData = ANNReportData.set_index(["ID"])
        raw_data = raw_data.set_index(["ID"])
        Data = {}
        for iID in raw_data.index.unique():
            if ANNReportData is not None:
                if iID in ANNReportData.index:
                    iANNReportData = ANNReportData.loc[[iID]]
                else:
                    continue
            else:
                iANNReportData = None
            Data[iID] = CalcFun(Dates, raw_data.loc[[iID]], iANNReportData, factor_names, LookBack, FYNum)
        Data = pd.Panel(Data, minor_axis=factor_names)
        Data.major_axis = [dt.datetime.strptime(iDate, "%Y%m%d") for iDate in Dates]
        Data = Data.swapaxes(0, 2)
        if LookBack==0: return Data.loc[:, dts, ids]
        AllDTs = Data.major_axis.union(set(dts)).sort_values()
        Data = Data.loc[:, AllDTs, ids]
        Limits = LookBack*24.0*3600
        for i, iFactorName in enumerate(Data.items):
            Data.iloc[i] = fillNaByLookback(Data.iloc[i], lookback=Limits)
        return Data.loc[:, dts] 
Example 49
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0
def _aggregate(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nID = len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    AggrFun = args["OperatorArg"]["aggr_fun"]
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nID, ), fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                Rslt[i] = AggrFun(FactorData[iMask])
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                Rslt[iMask] = AggrFun(FactorData[iMask])
    else:
        Rslt = np.full(shape=(nID, ), fill_value=AggrFun(FactorData[Mask]))
    return Rslt 
Example 50
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0
def _aggr_sum(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                Rslt[:, i] = np.nansum(iMask * FactorData, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                Rslt[iMask] = np.nansum(iMask * FactorData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        Rslt = np.nansum(FactorData * Mask, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
Example 51
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes vote down vote up
def _aggr_prod(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanprod(iData, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            iData = np.full(shape=FactorData.shape, fill_value=np.nan)
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanprod(iData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanprod(FactorData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
Example 52
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes vote down vote up
def _aggr_max(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if FactorData.shape[1]==0: return np.full(shape=(nDT, nID), fill_value=np.nan)
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanmax(iData, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanmax(iData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanmax(FactorData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
Example 53
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0
def _aggr_mean(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["Weight"]:
        WeightData = Data[1+args["OperatorArg"]["Mask"]]  # the weight factor sits right after the optional mask slot (Data[1] is the mask when present)
    else:
        WeightData = np.ones(FactorData.shape)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                Rslt[:, i] = np.nansum(iMask * WeightData * FactorData, axis=1) / np.nansum(iMask * WeightData, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                Rslt[iMask] = (np.nansum(iMask * WeightData * FactorData, axis=1) / np.nansum(iMask * WeightData, axis=1)).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        Rslt = (np.nansum(FactorData * WeightData * Mask, axis=1) / np.nansum(WeightData * Mask, axis=1)).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
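_aggr_mean computes the weighted mean as nansum(w * x * mask) / nansum(w * mask). One subtlety is worth checking with toy numbers (this snippet is illustrative, not QuantStudio code): np.nansum drops NaN terms from the numerator only, so weights at positions where the factor value is NaN still enter the denominator unless they are masked out explicitly.

import numpy as np

x = np.array([[1.0, 2.0, np.nan]])
w = np.array([[1.0, 3.0, 5.0]])

as_written = np.nansum(x * w, axis=1) / np.nansum(w, axis=1)                # (1 + 6) / 9
nan_aware = np.nansum(x * w, axis=1) / np.nansum(w * ~np.isnan(x), axis=1)  # (1 + 6) / 4
print(as_written, nan_aware)  # ~0.778 vs 1.75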
Example 54
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes
def _aggr_std(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanstd(iData, axis=1, ddof=args["OperatorArg"]["ddof"])
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanstd(iData, axis=1, ddof=args["OperatorArg"]["ddof"]).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanstd(FactorData, axis=1, ddof=args["OperatorArg"]["ddof"]).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
Example 55
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes
def _aggr_var(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanvar(iData, axis=1, ddof=args["OperatorArg"]["ddof"])
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanvar(iData, axis=1, ddof=args["OperatorArg"]["ddof"]).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanvar(FactorData, axis=1, ddof=args["OperatorArg"]["ddof"]).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
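_aggr_std and _aggr_var pass ddof straight through to NumPy: ddof=0 gives the population estimate, ddof=1 the sample estimate, and the two examples are consistent in that nanvar equals nanstd squared for a matching ddof. A quick check:

import numpy as np

x = np.array([1.0, 2.0, np.nan, 4.0])
print(np.nanvar(x, ddof=0), np.nanstd(x, ddof=0) ** 2)  # both ~1.556
print(np.nanvar(x, ddof=1), np.nanstd(x, ddof=1) ** 2)  # both ~2.333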
Example 56
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes
def _aggr_median(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanmedian(iData, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanmedian(iData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanmedian(FactorData, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
Example 57
Project: QuantStudio   Author: Scorpi000   File: FactorTools.py    GNU General Public License v3.0 5 votes
def _aggr_quantile(f,idt,iid,x,args):
    Data = _genOperatorData(f,idt,iid,x,args)
    nDT, nID = len(idt), len(iid)
    FactorData = Data[0]
    if args["OperatorArg"]["Mask"]:
        Mask = (Data[1]==1)
    else:
        Mask = np.full(FactorData.shape, fill_value=True)
    if args["OperatorArg"]["CatData"]:
        CatData = Data[-1]
        Rslt = np.full(shape=(nDT, nID), fill_value=np.nan)
        iData = np.full(shape=FactorData.shape, fill_value=np.nan)
        if args["OperatorArg"]["SectionChged"]:
            for i, iID in enumerate(iid):
                iMask = ((CatData==iID) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[:, i] = np.nanpercentile(iData, q=args["OperatorArg"]["quantile"]*100, axis=1)
        else:
            AllCats = pd.unique(CatData.flatten())
            for i, iCat in enumerate(AllCats):
                if pd.isnull(iCat):
                    iMask = (pd.isnull(CatData) & Mask)
                else:
                    iMask = ((CatData==iCat) & Mask)
                iData[:] = np.nan
                iData[iMask] = FactorData[iMask]
                Rslt[iMask] = np.nanpercentile(iData, q=args["OperatorArg"]["quantile"]*100, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)[iMask]
    else:
        FactorData[~Mask] = np.nan
        Rslt = np.nanpercentile(FactorData, q=args["OperatorArg"]["quantile"]*100, axis=1).reshape((nDT, 1)).repeat(nID, axis=1)
    return Rslt 
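np.nanpercentile takes q in percent, which is why the code above scales the quantile argument by 100. A one-line sanity check with toy data:

import numpy as np

x = np.array([[1.0, np.nan, 3.0, 4.0]])
print(np.nanpercentile(x, q=0.5 * 100, axis=1))  # [3.] -- the median of [1, 3, 4]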
Example 58
Project: recruit   Author: Frank-qlu   File: test_cut.py    Apache License 2.0 5 votes
def test_cut_duplicates_bin(kwargs, msg):
    # see gh-20947
    bins = [0, 2, 4, 6, 10, 10]
    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])

    if msg is not None:
        with pytest.raises(ValueError, match=msg):
            cut(values, bins, **kwargs)
    else:
        result = cut(values, bins, **kwargs)
        expected = cut(values, pd.unique(bins))
        tm.assert_series_equal(result, expected) 
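The test leans on pd.unique to deduplicate the bin edges, matching what cut(..., duplicates='drop') does. A standalone sketch of that equivalence (assuming pandas >= 0.23, where the duplicates keyword was added):

import pandas as pd

values = pd.Series([1, 3, 5, 7, 9], index=list("abcde"))
bins = [0, 2, 4, 6, 10, 10]  # last edge repeated

dropped = pd.cut(values, bins, duplicates="drop")
deduped = pd.cut(values, pd.unique(bins))
print(dropped.equals(deduped))  # True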
Example 59
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray) 
Example 60
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        assert isinstance(result, np.ndarray) 
Example 61
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_object_refcount_bug(self):
        lst = ['A', 'B', 'C', 'D', 'E']
        for i in range(1000):
            len(algos.unique(lst)) 
Example 62
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_on_index_object(self):

        mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
            np.arange(5), 5)])
        expected = mindex.values
        expected.sort()

        mindex = mindex.repeat(2)

        result = pd.unique(mindex)
        result.sort()

        tm.assert_almost_equal(result, expected) 
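Note the comparison target here is mindex.values, an object ndarray of tuples. On the pandas versions these tests target, pd.unique of a MultiIndex returns such an ndarray; newer releases may return a MultiIndex instead, so treat the output type below as version-dependent:

import pandas as pd

mi = pd.MultiIndex.from_arrays([[0, 0, 1], ["a", "a", "b"]]).repeat(2)
print(pd.unique(mi))
# e.g. array([(0, 'a'), (1, 'b')], dtype=object) on older pandas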
Example 63
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_uint64_overflow(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.unique(s), exp) 
Example 64
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_nan_in_object_array(self):
        duplicated_items = ['a', np.nan, 'c', 'c']
        result = pd.unique(duplicated_items)
        expected = np.array(['a', np.nan, 'c'], dtype=object)
        tm.assert_numpy_array_equal(result, expected) 
Example 65
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_datetime64tz_aware(self):
        # GH 15939

        result = Series(
            Index([Timestamp('20160101', tz='US/Eastern'),
                   Timestamp('20160101', tz='US/Eastern')])).unique()
        expected = DatetimeArray._from_sequence(np.array([
            Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
        ]))
        tm.assert_extension_array_equal(result, expected)

        result = Index([Timestamp('20160101', tz='US/Eastern'),
                        Timestamp('20160101', tz='US/Eastern')]).unique()
        expected = DatetimeIndex(['2016-01-01 00:00:00'],
                                 dtype='datetime64[ns, US/Eastern]', freq=None)
        tm.assert_index_equal(result, expected)

        result = pd.unique(
            Series(Index([Timestamp('20160101', tz='US/Eastern'),
                          Timestamp('20160101', tz='US/Eastern')])))
        expected = DatetimeArray._from_sequence(np.array([
            Timestamp('2016-01-01', tz="US/Eastern"),
        ]))
        tm.assert_extension_array_equal(result, expected)

        result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
                                  Timestamp('20160101', tz='US/Eastern')]))
        expected = DatetimeIndex(['2016-01-01 00:00:00'],
                                 dtype='datetime64[ns, US/Eastern]', freq=None)
        tm.assert_index_equal(result, expected) 
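The point of GH 15939 is dtype preservation: unique on tz-aware input keeps the timezone rather than coercing to naive datetime64[ns]. A compact check (assuming pandas >= 0.24, where the array result is a DatetimeArray):

import pandas as pd

s = pd.Series([pd.Timestamp("20160101", tz="US/Eastern")] * 2)
print(s.unique().dtype)  # datetime64[ns, US/Eastern] -- the tz survives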
Example 66
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_tuple_with_strings(self, arg, expected):
        # see GH 17108
        result = pd.unique(arg)
        tm.assert_numpy_array_equal(result, expected) 
Example 67
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_obj_none_preservation(self):
        # GH 20866
        arr = np.array(['foo', None], dtype=object)
        result = pd.unique(arr)
        expected = np.array(['foo', None], dtype=object)

        tm.assert_numpy_array_equal(result, expected, strict_nan=True) 
Example 68
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_signed_zero(self):
        # GH 21866
        a = np.array([-0.0, 0.0])
        result = pd.unique(a)
        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
        tm.assert_numpy_array_equal(result, expected) 
Example 69
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_different_nans(self):
        # GH 21866
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
        result = pd.unique(a)
        expected = np.array([np.nan])
        tm.assert_numpy_array_equal(result, expected) 
Example 70
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_unique_tuples(self, arr, unique):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(unique), dtype=object)
        expected[:] = unique

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected) 
Example 71
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_get_unique(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(s.unique(), exp) 
Example 72
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_hashtable_unique(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_unique = s_duplicated.drop_duplicates(keep='first').values
        result_unique = htable().unique(s_duplicated.values)
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # test return_inverse=True
        # reconstruction can only succeed if the inverse is correct
        result_unique, result_inverse = htable().unique(s_duplicated.values,
                                                        return_inverse=True)
        tm.assert_numpy_array_equal(result_unique, expected_unique)
        reconstr = result_unique[result_inverse]
        tm.assert_numpy_array_equal(reconstr, s_duplicated.values) 
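The hashtable layer exercised here is internal; the public analogue of unique(..., return_inverse=True) is pd.factorize, whose codes play the role of the inverse. A sketch of the same reconstruction round-trip through the public API:

import numpy as np
import pandas as pd

values = np.array(["b", "a", "b", "c", "a"], dtype=object)
codes, uniques = pd.factorize(values)  # first-occurrence order, like ht.unique
print(np.array_equal(uniques[codes], values))  # True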
Example 73
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_hashtable_factorize(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)
        na_mask = s_duplicated.isna().values

        result_unique, result_inverse = htable().factorize(s_duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_unique = s_duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        result_reconstruct = result_unique[result_inverse[~na_mask]]
        expected_reconstruct = s_duplicated.dropna().values
        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct) 
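As the comments in the test note, factorization drops NaNs from the uniques and flags them in the codes; the public pd.factorize shows the same behavior:

import numpy as np
import pandas as pd

codes, uniques = pd.factorize(np.array(["a", np.nan, "a"], dtype=object))
print(codes)    # [ 0 -1  0] -- NaN maps to the na_sentinel, -1 by default
print(uniques)  # ['a'] -- no entry for NaN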
Example 74
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 5 votes
def test_unique_label_indices():

    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False)

    a[np.random.choice(len(a), 10)] = -1
    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right,
                                check_dtype=False) 
Example 75
Project: recruit   Author: Frank-qlu   File: base.py    Apache License 2.0 5 votes
def unique(self):
        """
        Compute the ExtensionArray of unique values.

        Returns
        -------
        uniques : ExtensionArray
        """
        from pandas import unique

        uniques = unique(self.astype(object))
        return self._from_sequence(uniques, dtype=self.dtype) 
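A quick way to see this base-class behavior from user code is through one of pandas' built-in extension types: unique on a nullable integer array hands back the same extension type (assuming a pandas version that ships the nullable Int64 dtype):

import pandas as pd

arr = pd.array([1, 1, None, 2], dtype="Int64")
print(arr.unique())  # IntegerArray of [1, NA, 2] -- still an ExtensionArray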
Example 76
Project: gullikson-scripts   Author: kgullikson88   File: Analyze_CCF.py    MIT License 4 votes
def get_temperature_run(self, starname=None, date=None, df=None):
        """
        Return the maximum ccf height for each temperature. Either starname AND date, or df must be given

        Parameters:
        ===========
        - starname:      string
                         The name of the star

        - date:          string
                         The date of the observation

        - df:            pandas DataFrame
                         Input dataframe, such as from _compile_data. Overrides starname and date, if given

        Returns:
        ========
        - temp_run:      pandas DataFrame
                         Contains all the best parameters for each temperature
        """
        # Get the dataframe if it isn't given
        if df is None:
            if starname is None or date is None:
                raise ValueError('Must give both starname and date (or give df) to get_temperature_run!')
            df = self._compile_data(starname, date)

        # Find the maximum CCF for each set of parameters
        fcn = lambda row: (np.max(row), self.velocities[np.argmax(row)])
        vals = df['ccf'].map(fcn)
        df['ccf_max'] = vals.map(lambda l: l[0])
        df['rv'] = vals.map(lambda l: l[1])

        # Find the best parameters for each temperature
        d = defaultdict(list)
        temperatures = pd.unique(df['T'])
        for T in temperatures:
            good = df.loc[df['T'] == T]
            best = good.loc[good.ccf_max == good.ccf_max.max()]
            d['vsini'].append(best['vsini'].item())
            d['logg'].append(best['logg'].item())
            d['[Fe/H]'].append(best['[Fe/H]'].item())
            d['rv'].append(best['rv'].item())
            d['ccf_value'].append(best.ccf_max.item())
            d['T'].append(T)
            d['metal'].append(best['[Fe/H]'].item())

        return pd.DataFrame(data=d) 
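The loop over pd.unique(df['T']) selects, for each temperature, the row with the maximal ccf_max. A roughly equivalent and more idiomatic formulation uses groupby(...).idxmax(); this sketch assumes the same hypothetical 'T' and 'ccf_max' columns and a unique index (ties resolve to the first occurrence):

import pandas as pd

def best_per_temperature(df):
    # df is assumed to carry 'T' and 'ccf_max' columns as built above.
    best_rows = df.loc[df.groupby("T")["ccf_max"].idxmax()]
    return best_rows.sort_values("T").reset_index(drop=True)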
Example 77
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: base.py    MIT License 4 votes
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]:
        """
        Encode the extension array as an enumerated type.

        Parameters
        ----------
        na_sentinel : int, default -1
            Value to use in the `labels` array to indicate missing values.

        Returns
        -------
        labels : ndarray
            An integer NumPy array that's an indexer into the original
            ExtensionArray.
        uniques : ExtensionArray
            An ExtensionArray containing the unique values of `self`.

            .. note::

               uniques will *not* contain an entry for the NA value of
               the ExtensionArray if there are any missing values present
               in `self`.

        See Also
        --------
        factorize : Top-level factorize method that dispatches here.

        Notes
        -----
        :meth:`pandas.factorize` offers a `sort` keyword as well.
        """
        # Implementer note: There are two ways to override the behavior of
        # pandas.factorize
        # 1. _values_for_factorize and _from_factorize.
        #    Specify the values passed to pandas' internal factorization
        #    routines, and how to convert from those values back to the
        #    original ExtensionArray.
        # 2. ExtensionArray.factorize.
        #    Complete control over factorization.
        from pandas.core.algorithms import _factorize_array

        arr, na_value = self._values_for_factorize()

        labels, uniques = _factorize_array(
            arr, na_sentinel=na_sentinel, na_value=na_value
        )

        uniques = self._from_factorized(uniques, self)
        return labels, uniques 
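The NA behavior the docstring calls out is easy to observe with a built-in extension array (again assuming the nullable Int64 dtype is available):

import pandas as pd

arr = pd.array([1, None, 1, 2], dtype="Int64")
codes, uniques = arr.factorize()
print(codes)    # [ 0 -1  0  1] -- the missing value becomes the na_sentinel
print(uniques)  # IntegerArray of [1, 2] -- no NA entry, as documented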
Example 78
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py    GNU General Public License v3.0 4 votes
def prepareRegressData(Y, X=None, x_varnames=None, has_constant=False, dummy_data=None, drop_dummy_na=False):
    NotNAMask = pd.notnull(Y)
    if X is None:
        if (dummy_data is None) and (not has_constant):
            return (NotNAMask, [], Y[NotNAMask], None)  # X is None in this branch, so there is nothing to subset
        x_varnames = []
    else:
        if np.ndim(X)>1:
            NotNAMask = ((np.sum(pd.isnull(X),axis=1)==0) & NotNAMask)
        else:
            NotNAMask = (pd.notnull(X) & NotNAMask)
            X = X.reshape((X.shape[0],1))
        if x_varnames is None:
            x_varnames = ["x_"+str(i) for i in range(X.shape[1])]
    # Expand the dummy factors
    if dummy_data is not None:
        if np.ndim(dummy_data)==1:
            dummy_data = dummy_data.reshape((dummy_data.shape[0],1))
        if drop_dummy_na:
            NotNAMask = (NotNAMask & (np.sum(pd.isnull(dummy_data),axis=1)==0))
        else:
            dummy_data[pd.isnull(dummy_data)] = np.nan  # normalize missing labels to np.nan so they form a single category
        dummy_data = dummy_data[NotNAMask]
        if X is not None:
            X = X[NotNAMask]
        Y = Y[NotNAMask]
        for i in range(dummy_data.shape[1]):
            AllCats = pd.unique(dummy_data[:,i])
            if (has_constant) or (i>0):
                AllCats = AllCats[:-1]
            if AllCats.shape[0]==0:
                continue
            iX = np.zeros((dummy_data.shape[0],AllCats.shape[0]))
            for j,jCat in enumerate(AllCats):
                if pd.isnull(jCat):
                    iX[pd.isnull(dummy_data[:,i]),j] = 1.0
                else:
                    iX[dummy_data[:,i]==jCat,j] = 1.0
            if X is not None:
                X = np.hstack((X,iX))
            else:
                X = iX
            x_varnames += list(AllCats)
    else:
        if X is not None:
            X = X[NotNAMask]
        Y = Y[NotNAMask]
    if has_constant:
        if X is None:
            X = np.ones((Y.shape[0],1))
        elif X.shape[0]>0:
            X = sm.add_constant(X, prepend=True)
        else:
            X = X.reshape((0,X.shape[1]+1))
        x_varnames = ["constant"]+x_varnames
    return (NotNAMask, x_varnames, Y, X)
# Z-Score standardization
# data: the data to standardize, array; cat_data: categorical data, array
# avg_statistics: statistic for the average, options: mean, median; dispersion_statistics: statistic for dispersion, options: standard deviation, MAD
# avg_weight: weights used to compute the average; dispersion_weight: weights used to compute the dispersion
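The body of the standardizer these comments describe is not reproduced in this collection; as a hedged sketch, a category-aware Z-Score using the mean/standard-deviation defaults could look like the following (the function name and internals are illustrative, not QuantStudio's actual implementation):

import numpy as np
import pandas as pd

def standardizeZScore(data, cat_data=None, mask=None):
    data = np.asarray(data, dtype=float)
    rslt = np.full(data.shape, np.nan)
    if mask is None:
        mask = np.full(data.shape, True)
    if cat_data is None:
        rslt[mask] = (data[mask] - np.nanmean(data[mask])) / np.nanstd(data[mask])
        return rslt
    cat_data = np.asarray(cat_data)
    for iCat in pd.unique(cat_data.flatten()):
        # NaN categories compare unequal to themselves and are skipped in this sketch.
        iMask = (mask & (cat_data == iCat))
        if not iMask.any():
            continue
        rslt[iMask] = (data[iMask] - np.nanmean(data[iMask])) / np.nanstd(data[iMask])
    return rslt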
Example 79
Project: QuantStudio   Author: Scorpi000   File: JYDB.py    GNU General Public License v3.0 4 votes
def _adjustRawDataByRelatedField(self, raw_data, fields):
        RelatedFields = self._FactorInfo["RelatedSQL"].loc[fields]
        RelatedFields = RelatedFields[pd.notnull(RelatedFields)]
        if RelatedFields.shape[0]==0: return raw_data
        for iField in RelatedFields.index:
            iOldData = raw_data.pop(iField)
            iDataType = _identifyDataType(self._FactorInfo.loc[iField, "DataType"])
            if iDataType=="double":
                iNewData = pd.Series(np.nan, index=raw_data.index, dtype="float")
            else:
                iNewData = pd.Series(np.full(shape=(raw_data.shape[0], ), fill_value=None, dtype="O"), index=raw_data.index, dtype="O")
            iSQLStr = self._FactorInfo.loc[iField, "RelatedSQL"]
            if iSQLStr[0]=="{":
                iMapInfo = eval(iSQLStr).items()
            else:
                iStartIdx = iSQLStr.find("{KeyCondition}")
                if iStartIdx!=-1:
                    iEndIdx = iSQLStr[iStartIdx:].find(" ")
                    if iEndIdx==-1: iEndIdx = len(iSQLStr)
                    else: iEndIdx += iStartIdx
                    iStartIdx += 14  # len("{KeyCondition}"): skip past the placeholder to the key field name
                    KeyField = iSQLStr[iStartIdx:iEndIdx]
                    iOldDataType = _identifyDataType(self._FactorInfo.loc[iField[:-2], "DataType"])
                    iKeys = iOldData[pd.notnull(iOldData)].unique().tolist()
                    if iKeys:
                        KeyCondition = genSQLInCondition(KeyField, iKeys, is_str=(iOldDataType!="double"))
                    else:
                        KeyCondition = KeyField+" IN (NULL)"
                    iSQLStr = iSQLStr.replace("{KeyCondition}"+KeyField, "{KeyCondition}")
                else:
                    KeyCondition = ""
                if iSQLStr.find("{Keys}")!=-1:
                    Keys = ", ".join([str(iKey) for iKey in iOldData[pd.notnull(iOldData)].unique()])
                    if not Keys: Keys = "NULL"
                else:
                    Keys = ""
                if iSQLStr.find("{SecuCode}")!=-1:
                    SecuCode = self._getSecuMainIDField()
                else:
                    SecuCode = ""
                iMapInfo = self._FactorDB.fetchall(iSQLStr.format(TablePrefix=self._FactorDB.TablePrefix, Keys=Keys, KeyCondition=KeyCondition, SecuCode=SecuCode))
            for jVal, jRelatedVal in iMapInfo:
                if pd.notnull(jVal):
                    iNewData[iOldData==jVal] = jRelatedVal
                else:
                    iNewData[pd.isnull(iOldData)] = jRelatedVal
            raw_data[iField] = iNewData
        return raw_data 
Example 80
Project: recruit   Author: Frank-qlu   File: test_algos.py    Apache License 2.0 4 votes
def test_categorical(self):

        # we are expecting to return in the order
        # of appearance
        expected = Categorical(list('bac'), categories=list('bac'))

        # we are expecting to return in the order
        # of the categories
        expected_o = Categorical(
            list('bac'), categories=list('abc'), ordered=True)

        # GH 15939
        c = Categorical(list('baabc'))
        result = c.unique()
        tm.assert_categorical_equal(result, expected)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected)

        c = Categorical(list('baabc'), ordered=True)
        result = c.unique()
        tm.assert_categorical_equal(result, expected_o)

        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected_o)

        # Series of categorical dtype
        s = Series(Categorical(list('baabc')), name='foo')
        result = s.unique()
        tm.assert_categorical_equal(result, expected)

        result = pd.unique(s)
        tm.assert_categorical_equal(result, expected)

        # CI -> return CI
        ci = CategoricalIndex(Categorical(list('baabc'),
                                          categories=list('bac')))
        expected = CategoricalIndex(expected)
        result = ci.unique()
        tm.assert_index_equal(result, expected)

        result = pd.unique(ci)
        tm.assert_index_equal(result, expected)