Python pandas.isnull() Examples

The following code examples show how to use pandas.isnull(). They are taken from open source Python projects.
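
As a quick orientation before the project examples, a minimal sketch of the basics: pd.isnull() (an alias of pd.isna()) accepts scalars, array-likes, Series, and DataFrames, returning a single bool or an element-wise boolean mask.

import numpy as np
import pandas as pd

print(pd.isnull(np.nan))    # True
print(pd.isnull(None))      # True
print(pd.isnull(3.14))      # False
print(pd.isnull(pd.Series([1.0, np.nan])))         # element-wise boolean Series
print(pd.isnull(pd.DataFrame({"a": [1, None]})))   # element-wise boolean DataFrame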

Example 1
Project: programsynthesishunting   Author: flexgp   File: individual.py   License: GNU General Public License v3.0
def __lt__(self, other):
        """
        Set the definition for comparison of two instances of the individual
        class by their fitness values. Allows for sorting/ordering of a
        population of individuals. Note that numpy NaN is used for invalid
        individuals and is used by some fitness functions as a default fitness.
        We implement a custom catch for these NaN values.

        :param other: Another instance of the individual class (i.e. another
        individual) with which to compare.
        :return: Whether or not the fitness of the current individual is
        less than that of the comparison individual.
        """

        if pd.isnull(self.fitness): return True
        elif pd.isnull(other.fitness): return False
        else: return self.fitness < other.fitness if params['FITNESS_FUNCTION'].maximise else other.fitness < self.fitness 
Example 2
Project: programsynthesishunting   Author: flexgp   File: individual.py   License: GNU General Public License v3.0
def __le__(self, other):
        """
        Set the definition for comparison of two instances of the individual
        class by their fitness values. Allows for sorting/ordering of a
        population of individuals. Note that numpy NaN is used for invalid
        individuals and is used by some fitness functions as a default fitness.
        We implement a custom catch for these NaN values.

        :param other: Another instance of the individual class (i.e. another
        individual) with which to compare.
        :return: Whether or not the fitness of the current individual is
        less than or equal to that of the comparison individual.
        """

        if pd.isnull(self.fitness): return True
        elif pd.isnull(other.fitness): return False
        else: return self.fitness <= other.fitness if params['FITNESS_FUNCTION'].maximise else other.fitness <= self.fitness 
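Examples 1 and 2 describe a NaN-aware ordering; below is a minimal, self-contained sketch (with a hypothetical Ind class standing in for the individual class, assuming a maximising fitness function) of how the pd.isnull catches make invalid individuals sort ahead of valid ones:

import pandas as pd

class Ind:
    # hypothetical stand-in for the individual class above (maximising case)
    def __init__(self, fitness):
        self.fitness = fitness
    def __lt__(self, other):
        if pd.isnull(self.fitness): return True       # invalid individuals sort first
        elif pd.isnull(other.fitness): return False
        else: return self.fitness < other.fitness

pop = [Ind(3.0), Ind(float("nan")), Ind(7.0)]
print([i.fitness for i in sorted(pop)])  # [nan, 3.0, 7.0]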
Example 3
Project: Feature-Stuff   Author: hiflyin   File: categorical.py   License: MIT License
def encode_labels(df, cols = None):
    '''
    Inputs:
        df: a pandas dataframe containing the columns for which to do label encoding
        cols: names of all columns for which to do label encoding. If None (default), all object columns are taken.
    Output: df with cols replaced by the corresponding label encodings while maintaining all existing None values at their positions.
    '''

    le = LabelEncoder()
    if cols is None:
        # default promised by the docstring: take all object columns
        cols = df.select_dtypes(include='object').columns
    for col in cols:
        # pick an arbitrary value from the col - it is set back to null at the end anyway
        null_replacement = df[col].values[0]
        # save col null positions and set ones for the rest
        nan_col = np.array([1 if not pd.isnull(x) else x for x in df[col]])
        # replace nulls in the original array, and fit on it
        a = np.array([x if not pd.isnull(x) else null_replacement for x in df[col]])
        le.fit(a)
        # transform the data and add the nulls back
        df[col] = le.transform(a) * nan_col

    return df
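A hypothetical usage sketch for encode_labels, assuming its imports (pandas as pd, numpy as np, and sklearn's LabelEncoder) are in scope; note how the NaN position survives the round trip:

df = pd.DataFrame({'city': ['b', np.nan, 'a']})
print(encode_labels(df, cols=['city']))
#    city
# 0   1.0
# 1   NaN
# 2   0.0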
Example 4
Project: watttime-python-client   Author: WattTime   File: test_client.py   License: Apache License 2.0
def test_get_impact_between_ffill(self):
        # get data
        series_pre = self.impacter.get_impact_between(self.start_at, self.end_at,
                                                      interval_minutes=5, ba='PJM')
        self.assertIsNotNone(series_pre[-1])

        # fake cache with null value
        self.impacter.insert_to_cache(self.end_at, 'PJM', 'RT5M', None)

        # get data again, forward fills
        series_post = self.impacter.get_impact_between(self.start_at, self.end_at,
                                                       interval_minutes=5, ba='PJM')
        self.assertEqual(series_pre[-1], series_post[-1])

        # get data again, no fill
        series_na = self.impacter.get_impact_between(self.start_at, self.end_at,
                                                     interval_minutes=5, ba='PJM',
                                                     fill=False)
        self.assertTrue(pd.isnull(series_na[-1])) 
Example 5
Project: EarlyWarning   Author: wjlei1990   File: utils.py   License: GNU General Public License v3.0
def validate_numerical_dtype(df):
    col_dtypes = get_columns_dtype(df)
    if np.dtype(object) in col_dtypes:
        cols_object = col_dtypes[np.dtype(object)]
        print("bad %s[%d]: %s" % (np.dtype(object), len(cols_object),
                                  cols_object))
        raise ValueError("Object still in columns")

    nan_flag = df.isnull().values.any()
    if nan_flag:
        for col in df:
            n_nan = df[col].isnull().sum()
            if n_nan > 0:
                print("column %s has nan values: %d" % (col, n_nan))
        raise ValueError("Still NaN values in current dataframe")

    print("-" * 10 + " Validation passed " + "-" * 10) 
Example 6
Project: DETAD   Author: HumamAlwassel   File: sensitivity_analysis.py   License: MIT License
def compute_mAP_N(result,this_cls_pred,this_cls_gt):
    ap = np.zeros(len(result.tiou_thresholds))
    tp = np.zeros((len(result.tiou_thresholds), len(this_cls_pred)))
    fp = np.zeros((len(result.tiou_thresholds), len(this_cls_pred)))

    for tidx, tiou in enumerate(result.tiou_thresholds): 
        fp[tidx,pd.isnull(this_cls_pred[result.matched_gt_id_cols[tidx]]).values] = 1
        tp[tidx,~(pd.isnull(this_cls_pred[result.matched_gt_id_cols[tidx]]).values)] = 1

    tp_cumsum = np.cumsum(tp, axis=1).astype(float)
    fp_cumsum = np.cumsum(fp, axis=1).astype(float)
    recall_cumsum = tp_cumsum / len(np.unique(this_cls_gt['gt-id']))
    precision_cumsum = recall_cumsum * result.average_num_instance_per_class / (recall_cumsum * result.average_num_instance_per_class + fp_cumsum)

    for tidx in range(len(result.tiou_thresholds)):
        ap[tidx] = interpolated_prec_rec(precision_cumsum[tidx,:], recall_cumsum[tidx,:])
    
    return ap.mean()

# Initialize true positive and false positive vectors. 
Example 7
Project: group-contribution   Author: bdu91   File: process_thermo_data.py   License: MIT License
def get_TECRDB_rxn_data(self, file_name):
        """
        Extract thermodynamic data for reactions, including transformed reaction energy (dG_r) and transformed reaction enthalpy (dH_r).

        Write it into a dictionary that stores dG_r and dH_r; each reaction data point is numerically labeled for convenient later reference (e.g. Keq_1, deltaH_23)
        """

        self.all_thermo_data_dict['dG_r'] = {}
        self.all_thermo_data_dict['dH_r'] = {}
        rxns_thermo_data = pd.read_csv(file_name)
        for i, rxn_data_dict in rxns_thermo_data.iterrows():
            cur_r_dict = {'pH': rxn_data_dict['pH'], 'IS': rxn_data_dict['IS'], 'T': rxn_data_dict['T'], 'rxn_formula':rxn_data_dict['rxn_formula'], \
                          'rxn_dict': self.parse_formula(rxn_data_dict['rxn_formula']), 'metal ions': {}}
            metal_ion_list = ['Mg', 'Co', 'Na', 'K', 'Mn', 'Zn', 'Li', 'Ca']
            for metal_ion in metal_ion_list:
                if not pd.isnull(rxn_data_dict[metal_ion]):
                    cur_r_dict['metal ions'][metal_ion] = rxn_data_dict[metal_ion]
            if not pd.isnull(rxn_data_dict['Keq']):
                cur_r_dict['Keq'] = rxn_data_dict['Keq']
                self.all_thermo_data_dict['dG_r'][rxn_data_dict['rxn_id']] = cur_r_dict
            if not pd.isnull(rxn_data_dict['deltaH']):
                cur_r_dict['deltaH'] = rxn_data_dict['deltaH']   
                self.all_thermo_data_dict['dH_r'][rxn_data_dict['rxn_id']] = cur_r_dict 
Example 8
Project: recordlinkage   Author: J535D165   File: compare.py   License: BSD 3-Clause "New" or "Revised" License
def _compute_vectorized(self, s_left, s_right):

        # Values or agree/disagree
        if self.agree_value == 'value':
            compare = s_left.copy()
            compare[s_left != s_right] = self.disagree_value

        else:
            compare = pandas.Series(self.disagree_value, index=s_left.index)
            compare[s_left == s_right] = self.agree_value

        # Only when disagree value is not identical with the missing value
        if self.disagree_value != self.missing_value:
            compare[(s_left.isnull() | s_right.isnull())] = self.missing_value

        return compare 
Example 9
Project: recordlinkage   Author: J535D165   File: compare.py   License: BSD 3-Clause "New" or "Revised" License
def _compute_frequency(self, col):

        # https://github.com/pydata/pandas/issues/3729
        na_value = 'NAN'
        value_count = col.fillna(na_value)

        c = value_count.groupby(by=value_count).transform('count')
        c = c.astype(numpy.float64)

        if self.normalise:
            c = c / len(col)

        # replace missing values
        c[col.isnull()] = self.missing_value

        return c 
Example 10
Project: recordlinkage   Author: J535D165   File: string.py   License: BSD 3-Clause "New" or "Revised" License
def levenshtein_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    def levenshtein_apply(x):

        try:
            return 1 - jellyfish.levenshtein_distance(x[0], x[1]) \
                / np.max([len(x[0]), len(x[1])])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(levenshtein_apply) 
Example 11
Project: recordlinkage   Author: J535D165   File: string.py   License: BSD 3-Clause "New" or "Revised" License
def damerau_levenshtein_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    def damerau_levenshtein_apply(x):

        try:
            return 1 - jellyfish.damerau_levenshtein_distance(x[0], x[1]) \
                / np.max([len(x[0]), len(x[1])])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(damerau_levenshtein_apply) 
Example 12
Project: quail   Author: ContextLab   File: recmat.py   License: MIT License
def _recmat_exact(presented, recalled, features):
    lists = presented.index.get_values()
    cols = max(presented.shape[1], recalled.shape[1])
    result = np.empty((presented.shape[0], cols))*np.nan
    for li, l in enumerate(lists):
        p_list = presented.loc[l]
        r_list = recalled.loc[l]
        for i, feature in enumerate(features):
            get_feature = lambda x: np.array(x[feature]) if not np.array(pd.isnull(x['item'])).any() else np.nan
            p = np.vstack(p_list.apply(get_feature).get_values())
            r = r_list.dropna().apply(get_feature).get_values()
            r = np.vstack(list(filter(lambda x: x is not np.nan, r)))
            try:
                m = [np.where((p==x).all(axis=1))[0] for x in r]
            except AttributeError:
                m = []
            result[li, :len(m)] = [x[0]+1 if len(x)>0 else np.nan for x in m]
    return result 
Example 13
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: test_integer.py   License: MIT License
def test_conversions(data_missing):

    # astype to object series
    df = pd.DataFrame({"A": data_missing})
    result = df["A"].astype("object")
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name="A")
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df["A"].astype("object").values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            assert type(r) == type(e) 
Example 14
Project: QuantStudio   Author: Scorpi000   File: DefaultAccount.py   License: GNU General Public License v3.0
def _updateTradeLimit(self, idt):
        self._SellVolLimit[:] = self._BuyVolLimit[:] = np.inf
        TradePrice = self._MarketFT.readData(factor_names=[self.BuyLimit.TradePrice, self.SellLimit.TradePrice], ids=self._IDs, dts=[idt]).iloc[:,0,:]
        self._BuyPrice, self._SellPrice = TradePrice.iloc[:, 0], TradePrice.iloc[:, 1]
        self._BuyVolLimit[pd.isnull(self._BuyPrice) | (self._BuyPrice<=0)] = 0.0  # cannot buy when the buy trade price is missing
        self._SellVolLimit[pd.isnull(self._SellPrice) | (self._SellPrice<=0)] = 0.0  # cannot sell when the sell trade price is missing
        if self.SellLimit.LimitIDFilter:  # IDs matching the sell prohibition filter cannot be sold
            Mask = self._MarketFT.getIDMask(idt, ids=self._IDs, id_filter_str=self.SellLimit.LimitIDFilter)
            self._SellVolLimit[Mask] = 0.0
        if self.BuyLimit.LimitIDFilter:  # IDs matching the buy prohibition filter cannot be bought
            Mask = self._MarketFT.getIDMask(idt, ids=self._IDs, id_filter_str=self.BuyLimit.LimitIDFilter)
            self._BuyVolLimit[Mask] = 0.0
        if self.BuyLimit.Amt is not None:  # buy amount specified; volume must satisfy the amount limit
            Amount = self._MarketFT.readData(factor_names=[self.BuyLimit.Amt], ids=self._IDs, dts=[idt]).iloc[0,0,:]
            self._BuyVolLimit = self._BuyVolLimit.clip_upper(Amount * self.BuyLimit.AmtLimitRatio / self._BuyPrice)
        if self.SellLimit.Amt is not None:  # sell amount specified; volume must satisfy the amount limit
            Amount = self._MarketFT.readData(factor_names=[self.SellLimit.Amt], ids=self._IDs, dts=[idt]).iloc[0,0,:]
            self._SellVolLimit = self._SellVolLimit.clip_upper(Amount * self.SellLimit.AmtLimitRatio / self._SellPrice)
        if not self.SellLimit.ShortAllowed:
            PositionNum = self._PositionNum.iloc[self._Model.DateTimeIndex+1]
            self._SellVolLimit = self._SellVolLimit.clip_upper(PositionNum.clip_lower(0.0))
        return 0
    # Match and fill orders
Example 15
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py   License: GNU General Public License v3.0
def maskCategary(data_len,cat_data=None, mask=None):
    if mask is None:
        mask = (np.zeros((data_len,))==0)
    if cat_data is not None:
        cat_data[pd.isnull(cat_data)] = np.nan
        if cat_data.ndim==1:
            cat_data = cat_data.reshape((cat_data.shape[0],1))
        AllCats = [list(pd.unique(cat_data[mask,i])) for i in range(cat_data.shape[1])]
        AllCats = CartesianProduct(AllCats)
    else:
        AllCats = [(np.nan,)]
        cat_data = np.empty((data_len,1),dtype='float')+np.nan
    CatMask = {}
    for i,iCat in enumerate(AllCats):
        iMask = mask
        for j,jSubCat in enumerate(iCat):
            if pd.notnull(jSubCat):
                iMask = (iMask & (cat_data[:,j]==jSubCat))
            else:
                iMask = (iMask & pd.isnull(cat_data[:,j]))
        CatMask[tuple(iCat)] = iMask
    return CatMask
# Prepare the data for regression
Example 16
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py   License: GNU General Public License v3.0
def neutralize(Y, X, cov_matrix, mask=None, constant=False, dummy_data=None, drop_dummy_na=False, other_handle='填充None'):
    StdData = np.empty(Y.shape,dtype='float')+np.nan
    if mask is None:
        mask = pd.isnull(StdData)
    cov_matrix = cov_matrix[mask,:][:,mask]
    NotNAMask,_,YY,XX = prepareRegressData(Y[mask], (X[mask] if X is not None else X), has_constant=constant, dummy_data=(dummy_data[mask] if dummy_data is not None else dummy_data),drop_dummy_na=drop_dummy_na)
    Mask = (np.sum(pd.notnull(cov_matrix),axis=1)>0)
    Mask = ((np.sum(pd.isnull(cov_matrix[:,Mask]),axis=1)==0) & Mask & NotNAMask)
    cov_matrix = cov_matrix[Mask,:][:,Mask]
    YY = YY[Mask[NotNAMask]]
    XX = XX[Mask[NotNAMask]]
    if XX.ndim==1:
        XX = np.reshape(XX,(XX.shape[0],1))
    Temp = StdData[mask]
    Temp[Mask] = YY - np.dot(np.dot(np.dot(np.dot(XX,np.linalg.inv(np.dot(np.dot(XX.T,cov_matrix),XX))),XX.T),cov_matrix),YY)
    StdData[mask] = Temp
    if other_handle=="保持不变":
        StdData[~mask] = Y[~mask]
    return StdData
# Merge factor data
# data: data to process, [array,...] or array; method: merge method, options: "直接合成" (direct), "归一合成" (normalized); nan_handle: missing-value handling, options: "剩余合成" (merge remaining), "填充None" (fill with None)
Example 17
Project: QuantStudio   Author: Scorpi000   File: MatplotlibFun.py   License: GNU General Public License v3.0
def plotCandleStick(ax, quotes, xdata=None, width=0.2, colorup='#B70203', colordown='#3ACCCC', alpha=1.0):
    if xdata is None: xdata = np.arange(quotes.shape[0])
    OFFSET = width / 2.0
    Lines, Patches = [], []
    for i in range(quotes.shape[0]):
        if pd.isnull(quotes[i]).sum()>0: continue
        iOpen, iHigh, iLow, iClose = quotes[i]
        if iClose >= iOpen:
            iColor = colorup
            iLower = iOpen
            iHeight = iClose - iOpen
        else:
            iColor = colordown
            iLower = iClose
            iHeight = iOpen - iClose
        iVLine = Line2D(xdata=(xdata[i], xdata[i]), ydata=(iLow, iHigh), color=iColor, linewidth=0.5, antialiased=True)
        iRect = Rectangle(xy=(i-OFFSET, iLower), width=width, height=iHeight, facecolor=iColor, edgecolor=iColor)
        iRect.set_alpha(alpha)
        Lines.append(iVLine)
        Patches.append(iRect)
        ax.add_line(iVLine)
        ax.add_patch(iRect)
    ax.autoscale_view()
    return Lines, Patches 
Example 18
Project: QuantStudio   Author: Scorpi000   File: AuxiliaryFun.py   License: GNU General Public License v3.0
def changeMultiClass2SingleClass(multi_class, sep=None):
    MultiClass = []
    for i in range(multi_class.shape[1]):
        MultiClass.append(pd.unique(multi_class[:,i]).tolist())
    MultiClass = CartesianProduct(MultiClass)
    SingleClassData = np.empty(shape=(multi_class.shape[0],),dtype="O")
    ClassDict = {}
    for i,iMultiClass in enumerate(MultiClass):
        iMask = np.array([True]*multi_class.shape[0])
        if sep is not None:
            iSingleClass = sep.join(map(str,iMultiClass))
        else:
            iSingleClass = str(i)
        for j,jSubClass in enumerate(iMultiClass):
            if pd.notnull(jSubClass):
                iMask = iMask & (multi_class[:,j]==jSubClass)
            else:
                iMask = iMask & pd.isnull(multi_class[:,j])
        SingleClassData[iMask] = iSingleClass
        ClassDict[iSingleClass] = iMultiClass
    return (SingleClassData,ClassDict)
# Given a class subclass, return the mask of rows in class_data belonging to that class; if subclass is None, return an all-True mask
# subclass: [class names], e.g.: ['银行', '大盘']
# class_data: class data, DataFrame(columns=[class names]) or array
Example 19
Project: QuantStudio   Author: Scorpi000   File: AuxiliaryFun.py   License: GNU General Public License v3.0
def getClassMask(subclass,class_data):
    if isinstance(class_data, np.ndarray):
        Mask = np.array([True]*class_data.shape[0])
    else:
        Mask = pd.Series(True,index=class_data.index)
    if subclass is None:
        return Mask
    if isinstance(class_data, np.ndarray):
        for j,jSubClass in enumerate(subclass):
            if pd.notnull(jSubClass):
                Mask = Mask & (class_data[:,j]==jSubClass)
            else:
                Mask = Mask & pd.isnull(class_data[:,j])
    else:
        for j,jSubClass in enumerate(subclass):
            if pd.notnull(jSubClass):
                Mask = Mask & (class_data.iloc[:,j]==jSubClass)
            else:
                Mask = Mask & pd.isnull(class_data.iloc[:,j])
    return Mask

# Align two Series so that their indexes match, filling missing entries with a specified value
Example 20
Project: QuantStudio   Author: Scorpi000   File: DataTypeConversionFun.py   License: GNU General Public License v3.0
def DummyVarTo01Var(dummy_var,ignore_na=False,ignores=[],ignore_nonstring=False):
    if dummy_var.shape[0]==0:
        return pd.DataFrame()
    NAMask = pd.isnull(dummy_var)
    if ignore_na:
        AllClasses = dummy_var[~NAMask].unique()
    else:
        dummy_var[NAMask] = np.nan
        AllClasses = dummy_var.unique()
    AllClasses = [iClass for iClass in AllClasses if (iClass not in ignores) and ((not ignore_nonstring) or isinstance(iClass,str) or pd.isnull(iClass))]
    OZVar = pd.DataFrame(0.0,index=dummy_var.index,columns=AllClasses,dtype='float')
    for iClass in AllClasses:
        if pd.notnull(iClass):
            iMask = (dummy_var==iClass)
        else:
            iMask = NAMask
        OZVar[iClass][iMask] = 1.0
    return OZVar
# Convert a DataFrame into a Series with a two-level index built from the DataFrame's index and columns.
Example 21
Project: QuantStudio   Author: Scorpi000   File: RiskModelFun.py   License: GNU General Public License v3.0
def calcBlendingCoefficient(specific_ret):
    Gamma = {}
    for iID in specific_ret.columns:
        iSpecificRet = specific_ret[iID]
        iSpecificRet = iSpecificRet[pd.notnull(iSpecificRet)].values
        ih = iSpecificRet.shape[0]
        if ih==0:
            Gamma[iID]=0
            continue
        iRobustStd = 1/1.35*(np.percentile(iSpecificRet,75)-np.percentile(iSpecificRet,25))
        iSpecificRet[iSpecificRet>10*iRobustStd] = 10*iRobustStd
        iSpecificRet[iSpecificRet<-10*iRobustStd] = -10*iRobustStd
        iStd = np.std(iSpecificRet)
        iZVal = np.abs((iStd-iRobustStd)/iRobustStd)
        Gamma[iID] = min((1,max((0,(ih-60)/120))))*min((1,max((0,np.exp(1-iZVal)))))
    Gamma = pd.Series(Gamma,name='Gamma')
    Gamma[pd.isnull(Gamma)] = 0
    return Gamma
    
# Compute the structural forecast of specific risk
Example 22
Project: QuantStudio   Author: Scorpi000   File: JYDB.py   License: GNU General Public License v3.0
def __init__(self, name, fdb, sys_args={}, **kwargs):
        self._DBTableName = fdb.TablePrefix + fdb._TableInfo.loc[name, "DBTableName"]
        self._FactorInfo = fdb._FactorInfo.loc[name]
        self._IDField = self._FactorInfo["DBFieldName"][self._FactorInfo["FieldType"]=="ID"].iloc[0]  # ID field
        self._IDFieldIsStr = (_identifyDataType(self._FactorInfo["DataType"][self._FactorInfo["FieldType"]=="ID"].iloc[0])!="double")
        self._ConditionFields = self._FactorInfo[self._FactorInfo["FieldType"]=="Condition"].index.tolist()  # list of all condition fields
        self._MainTableName = fdb._TableInfo.loc[name, "MainTableName"]
        if pd.isnull(self._MainTableName):
            self._MainTableName = self._DBTableName
            self._MainTableID = self._IDField
            self._MainTableCondition = None
        else:
            self._MainTableName = fdb.TablePrefix + self._MainTableName
            self._MainTableID = fdb._TableInfo.loc[name, "MainTableID"]
            self._JoinCondition = fdb._TableInfo.loc[name, "JoinCondition"].format(DBTable=self._DBTableName, MainTable=self._MainTableName)
            self._MainTableCondition = fdb._TableInfo.loc[name, "MainTableCondition"]
            if pd.notnull(self._MainTableCondition):
                self._MainTableCondition = self._MainTableCondition.format(MainTable=self._MainTableName)
            self._IDFieldIsStr = True
        self._SecurityType = fdb._TableInfo.loc[name, "SecurityType"]
        return super().__init__(name=name, fdb=fdb, sys_args=sys_args, **kwargs) 
Example 23
Project: DataComp   Author: Cojabi   File: stats.py   License: Apache License 2.0
def p_correction(p_values):
    """
    Corrects p_values for multiple testing.

    :param p_values: Dictionary storing p_values with corresponding feature names as keys.
    :return: DataFrame which shows the results of the analysis; p-value, corrected p-value and boolean indicating \
    significance.
    """

    p_trans = _transform_p_dict(p_values)

    # get and drop features which are NaN to skip them in multitest correction
    nan_features = p_trans[pd.isnull(p_trans[0])]
    p_trans = p_trans.dropna(axis=0, subset=[0])

    # extract p_value column to pass into multiple testing correction
    p_val_col = p_trans[0].sort_values()

    # add NaN features back to p_trans to include them into result table later on
    p_trans = pd.concat([p_trans, nan_features])

    # raise an error if no p_values were calculated that can be passed into multiple test correction
    if p_val_col.values.size == 0:
        # unpack the p_values which are stored in 2 layer nested dicts.
        nested_values = []
        for value in p_values.values():
            nested_values.append(*value.values())

        # if all p_values are nan, return an all nan result table
        if pd.isnull(nested_values).all():
            result_table = _create_result_table(None, p_val_col, p_trans, conf_invs, counts)
            return result_table.sort_index()

        raise ValueError("No p_values have been submitted into multiple test correction.")

    # correct p-values
    result = multipletests(p_val_col.values)

    return result, p_val_col, p_trans 
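The essential move above is using pd.isnull to set NaN p-values aside before correction, since they cannot be passed into multipletests. A minimal sketch of that pattern, assuming statsmodels is installed:

import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests

p = pd.Series([0.01, np.nan, 0.04, 0.20], index=['f1', 'f2', 'f3', 'f4'])
valid = p[~pd.isnull(p)]                     # exclude NaNs before correction
reject, p_corr, _, _ = multipletests(valid.values)
print(pd.Series(p_corr, index=valid.index))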
Example 24
Project: DataComp   Author: Cojabi   File: utils.py   License: Apache License 2.0
def _categorical_table(data):
    """
    Returns the number of occurrences of each category. Used to build the observation table
    for a chi-square test.

    :param data:
    :return:
    """
    # count occurrences
    c = Counter(data)
    # delete NaNs
    c = {key: c[key] for key in c if not pd.isnull(key)}

    return pd.Series(c) 
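A usage sketch, assuming the helper above with its imports (collections.Counter and pandas as pd) in scope; both None and NaN keys are dropped by the pd.isnull filter:

data = ['yes', 'no', 'yes', None, float('nan'), 'no', 'yes']
print(_categorical_table(data))
# yes    3
# no     2
# dtype: int64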
Example 25
Project: contextualbandits   Author: david-cortes   File: utils.py   License: BSD 2-Clause "Simplified" License
def _check_fit_input(X, a, r, choice_names = None):
    X = _check_X_input(X)
    a = _check_1d_inp(a)
    r = _check_1d_inp(r)
    assert X.shape[0] == a.shape[0]
    assert X.shape[0] == r.shape[0]
    if choice_names is not None:
        a = pd.Categorical(a, choice_names).codes
        if pd.isnull(a).sum() > 0:
            raise ValueError("Input contains actions/arms that this object does not have.")
    return X, a, r 
Example 26
Project: watttime-python-client   Author: WattTime   File: test_client.py   License: Apache License 2.0
def test_get_impact_between_caiso(self):
        series = self.impacter.get_impact_between(self.caiso_start, self.caiso_end,
                                                  interval_minutes=5, ba='CAISO',
                                                  fill=False)

        # no null values
        self.assertFalse(pd.isnull(series).any()) 
Example 27
Project: EarlyWarning   Author: wjlei1990   File: utils.py   License: GNU General Public License v3.0
def add_nan_feature_columns(df, columns=None, verbose=True):
    """
    add one new feature for each original column to keep the
    information of NaN values BEFORE imputation.

    :param df: input dataframe
    :type df: pandas.DataFrame
    :return: None; changes df in place
    """
    if columns is None:
        columns = df.columns

    shape1 = df.shape
    ncols_add = 0
    for col in columns:
        new_col = "{0}._nan_feature_".format(col)
        new_col_data = pd.isnull(df[col]).astype(INT_COL_DTYPE)
        if new_col_data.sum() > 0:
            df[new_col] = new_col_data
            ncols_add += 1
    shape2 = df.shape

    if verbose:
        print("Columns added nan feature: %d/%d" %
              (ncols_add, len(columns)))
        print("Shape change after adding the nan feature: {0} --> "
              "{1}".format(shape1, shape2)) 
Example 28
Project: EarlyWarning   Author: wjlei1990   File: utils.py   License: GNU General Public License v3.0
def check_y_null(data_y):
    """ Check the number of NaN values in each column of y data """
    results = {}
    for col in data_y.columns:
        data = data_y[col]
        null_idx = [idx for (idx, v) in enumerate(data.isnull()) if v]
        results[col] = null_idx
        print("Number of null values in [%s]: %d/%d" %
              (col, len(null_idx), len(data)))

    return results 
Example 29
Project: EarlyWarning   Author: wjlei1990   File: utils.py   License: GNU General Public License v3.0
def extract_good_train_data(train_x, data_y, col):
    """
    Clean the data, since data_y contains NaN values for certain challengeIDs.
    The current plan is to drop the rows with NaN in y.
    """
    # extract good(not NaN) train_y
    good_ids = []
    good_locs = []
    bad_locs = []
    for iloc, v in enumerate(data_y[col].isnull()):
        if not v:
            good_ids.append(data_y["challengeID"][iloc])
            good_locs.append(iloc)
        else:
            bad_locs.append(iloc)

    print("Null y length: %d" % len(bad_locs))
    print("Not null y length: %d" % len(good_locs))
    train_y_clean = data_y.iloc[good_locs]
    print("train_y shape: ", train_y_clean.shape)

    train_x_clean = train_x.iloc[good_locs]
    print("train_x shape: ", train_x_clean.shape)
    train_x_bad = train_x.iloc[bad_locs]

    _validate_train_data(train_x_clean, train_y_clean)
    if train_x_clean.isnull().values.any():
        raise ValueError("NaN values in train_x_clean")
    if train_y_clean[col].isnull().values.any():
        raise ValueError("NaN values in train_y_clean")

    _good_ids = train_x_clean["challengeID"]
    for id1, id2 in zip(good_ids, _good_ids):
        if id1 != id2:
            raise ValueError("Error in good_ids")

    return train_x_clean.as_matrix(), train_y_clean[col].as_matrix(), \
        train_x_bad, _good_ids 
Example 30
Project: PathMe   Author: PathwayMerger   File: mappings_parser.py   License: Apache License 2.0
def get_mapped_pathways(dataframe):
    """Get pathways with mappings.

    :param pandas.DataFrame dataframe: data frame mappings
    :returns: pathway mappings
    :rtype: list[tuple[str, str, str]]
    """
    mappings = list()

    for _, row in dataframe.iterrows():

        equivalent_to_mappings = row['equivalentTo Mappings']

        if not pd.isnull(equivalent_to_mappings):

            for mapping_statement in equivalent_to_mappings.split("\n"):

                if mapping_statement == '':
                    continue

                reference_pathway, compared_pathway = get_pathways_from_statement(mapping_statement, "equivalentTo")

                mappings.append((reference_pathway, "equivalentTo", compared_pathway))

        is_part_of_mappings = row['isPartOf Mappings']

        if not pd.isnull(is_part_of_mappings):

            for mapping_statement in is_part_of_mappings.split('\n'):

                if mapping_statement == '':
                    continue

                reference_pathway, compared_pathway = parse_part_of_mapping(mapping_statement)

                mappings.append((reference_pathway, "isPartOf", compared_pathway))

    return mappings 
Example 31
Project: group-contribution   Author: bdu91   File: dSr_calculation.py   License: MIT License
def process_dSf_training_data(self):
        """
        Process the training data for dSf calculation, including data for dSf and dSr (linear combination of dSr)
        """
        self.dSf_training_sids = self.dSf_pKMg_data_df[~pd.isnull(self.dSf_pKMg_data_df['dS_f(J/K/mol)'])].species_id.tolist()
        self.dSr_training_formula_list = self.dSr_data_df.rxn_formula.tolist()
        self.dSf_data = [self.dSf_pKMg_data_df[self.dSf_pKMg_data_df.species_id == cur_sid]['dS_f(J/K/mol)'].tolist()[0] for cur_sid in self.dSf_training_sids]
        self.dSr_data = [self.dSr_data_df[self.dSr_data_df.rxn_formula == cur_formula].dS_r_from_slope.tolist()[0] for cur_formula in self.dSr_training_formula_list]
        self.dSf_data_total = np.array(self.dSf_data + self.dSr_data) 
Example 32
Project: group-contribution   Author: bdu91   File: process_TECRDB_compounds.py   License: MIT License
def get_TECRDB_compounds_data(self):
        """
        Reads in data for compounds in TECRDB.
        :return: a dictionary with keys being the different ion-bound states of the compound (called species_id here, e.g. CHB_15422_-1 refers to the -1 charged form of
        compound_id CHB_15422), and values being dictionaries storing the thermodynamic information and molecular properties of the species_id
        """
        TECRDB_compounds_data_table = pd.read_csv('data/TECRDB_compounds_data.csv')
        # all possible information that the particular ion-bound state can have
        data_entry_list = ['Cp', 'H_number', 'binding_constant', 'charge', 'dG_f', 'dH_f', 'dS_f', 'groups', 'metal_type','smiles_form','metal_number']
        for i, row in TECRDB_compounds_data_table.iterrows():
            cur_sid = row['species_id']
            cur_cid = row['compound_id']
            self.TECRDB_compounds_data_dict[cur_sid] = {'compound_id':cur_cid}
            if row['is_pH7_species'] == True:
                self.TECRDB_compounds_pH7_species_id_dict[cur_cid] = cur_sid
            if row['is_least_protonated_species'] == True:
                self.TECRDB_compounds_least_H_sid_dict[cur_cid] = cur_sid
            for data_entry in data_entry_list:
                if not pd.isnull(row[data_entry]):
                    if data_entry == 'groups':
                        #convert the text form of groups to python list
                        cur_sid_groups = map(float,row['groups'].strip('[').strip(']').split(','))
                        self.TECRDB_compounds_data_dict[cur_sid]['groups'] = cur_sid_groups
                    else:
                        try:
                            #convert value from string to float
                            self.TECRDB_compounds_data_dict[cur_sid][data_entry] = float(row[data_entry])
                        except ValueError:
                            self.TECRDB_compounds_data_dict[cur_sid][data_entry] = row[data_entry] 
Example 33
Project: recordlinkage   Author: J535D165   File: annotation.py   License: BSD 3-Clause "New" or "Revised" License
def _cast_value(value, na_value=None):

        if pd.isnull(value):
            return na_value
        elif type(value).__module__ == np.__name__:
            return value.item()
        else:
            return value 
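A usage sketch covering the three branches, assuming numpy as np is in scope and _cast_value is callable as a plain function:

print(_cast_value(np.int64(7)))           # 7 as a native Python int, via .item()
print(_cast_value(np.nan, na_value=''))   # '' replaces the missing value
print(_cast_value('label'))               # plain Python values pass through unchanged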
Example 34
Project: recordlinkage   Author: J535D165   File: compare.py   License: BSD 3-Clause "New" or "Revised" License
def _compute_vectorized(self, s_left, s_right):

        if self.method == 'jaro':
            str_sim_alg = jaro_similarity
        elif self.method in ['jarowinkler', 'jaro_winkler', 'jw']:
            str_sim_alg = jarowinkler_similarity
        elif self.method == 'levenshtein':
            str_sim_alg = levenshtein_similarity
        elif self.method in [
                'dameraulevenshtein', 'damerau_levenshtein', 'dl'
        ]:
            str_sim_alg = damerau_levenshtein_similarity
        elif self.method in ['q_gram', 'qgram']:
            str_sim_alg = qgram_similarity
        elif self.method == 'cosine':
            str_sim_alg = cosine_similarity
        elif self.method in ['smith_waterman', 'smithwaterman', 'sw']:
            str_sim_alg = smith_waterman_similarity
        elif self.method in ['longest_common_substring', 'lcs']:
            str_sim_alg = longest_common_substring_similarity
        else:
            raise ValueError("The algorithm '{}' is not known.".format(
                self.method))

        c = str_sim_alg(s_left, s_right)

        if self.threshold is not None:
            c = c.where((c < self.threshold) | (pandas.isnull(c)), other=1.0)
            c = c.where((c >= self.threshold) | (pandas.isnull(c)), other=0.0)

        c = _fillna(c, self.missing_value)

        return c 
Example 35
Project: recordlinkage   Author: J535D165   File: string.py   License: BSD 3-Clause "New" or "Revised" License
def jarowinkler_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    def jaro_winkler_apply(x):
        try:
            return jellyfish.jaro_winkler(x[0], x[1])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(jaro_winkler_apply) 
Example 36
Project: arctic   Author: man-group   File: numpy_arrays.py   License: GNU Lesser General Public License v2.1
def _convert_types(self, a):
        """
        Converts object arrays of strings to numpy string arrays
        """
        # No conversion for scalar type
        if a.dtype != 'object':
            return a, None

        # We can't infer the type of an empty array, so just
        # assume strings
        if len(a) == 0:
            return a.astype('U1'), None

        # Compute a mask of missing values. Replace NaNs and Nones with
        # empty strings so that type inference has a chance.
        mask = pd.isnull(a)
        if mask.sum() > 0:
            a = a.copy()
            np.putmask(a, mask, '')
        else:
            mask = None

        if infer_dtype(a, skipna=False) == 'mixed':
            # assume it's a string; otherwise raise an error
            try:
                a = np.array([s.encode('ascii') for s in a])
                a = a.astype('O')
            except:
                raise ValueError("Column of type 'mixed' cannot be converted to string")

        type_ = infer_dtype(a, skipna=False)
        if type_ in ['unicode', 'string']:
            max_len = max_len_string_array(a)
            return a.astype('U{:d}'.format(max_len)), mask
        else:
            raise ValueError('Cannot store arrays with {} dtype'.format(type_)) 
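The masking step above is a general recipe for object arrays, where pd.isnull catches both None and NaN; a minimal sketch:

import numpy as np
import pandas as pd

a = np.array(['foo', None, np.nan, 'bar'], dtype=object)
mask = pd.isnull(a)        # element-wise; True for both None and NaN
a = a.copy()
np.putmask(a, mask, '')    # blank out missing values so type inference can work
print(mask)                # [False  True  True False]
print(a)                   # ['foo' '' '' 'bar']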
Example 37
Project: quail   Author: ContextLab   File: helpers.py   License: MIT License
def check_nan(x):
    y = pd.isnull(x)
    if type(y) is bool:
        return y
    else:
        return False 
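check_nan leans on the fact that pd.isnull returns a scalar bool for scalar input but an array for array-like input; a quick sketch of the resulting behaviour:

import numpy as np

print(check_nan(np.nan))           # True  (scalar in, bool out)
print(check_nan('text'))           # False
print(check_nan([np.nan, 1.0]))    # False (array-like in, ndarray out, so not bool)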
Example 38
Project: msdas   Author: cokelaer   File: replicates.py   License: GNU General Public License v3.0
def plot_na_per_experiment(self, percent=False):
        """plot curve showing number of NAs per experiment

        .. plot::
            :include-source:
            :width: 70%

            from msdas import *
            from easydev import gsf
            filename= gsf("msdas", "data", "YEAST_raw_sample.csv")
            r = Replicates(filename, verbose=False)
            r.plot_na_per_experiment()



        """
        N = len(self.df)
        pylab.clf()
        if percent == False:
            count = pd.isnull(self.measurements).sum(axis=0)
            count.plot()
            pylab.ylabel("Number of NAs per experiment")
            pylab.ylim([0, N])
        else:
            count = pd.isnull(self.measurements).sum(axis=0)
            count/=float(N)
            count *= 100
            count.plot()
            pylab.ylabel("Number of NAs (percentage) per experiment")
            pylab.ylim([0,101])

        pylab.xticks(range(0, self.N), self.measurements.columns, rotation=90)



        pylab.tight_layout() 
Example 39
Project: msdas   Author: cokelaer   File: replicates.py   License: GNU General Public License v3.0
def get_na_per_experiment_after_averaging(self):
        """average data and figure out number of NA per alpha experiment"""
        mu = self.get_mu_df()

        na = pd.DataFrame()
        for exp in ['a0', 'a1', 'a5', 'a10', 'a20', 'a45']:
            sum_na = pd.isnull(mu[[x for x in mu.columns if x.startswith(exp+"_")]]).sum(axis=1)
            na[exp] = sum_na
        return na 
Example 40
Project: msdas   Author: cokelaer   File: replicates.py   License: GNU General Public License v3.0
def get_na_per_salt_experiment_after_averaging(self):
        """average data and figure out number of NA per alpha experiment"""
        mu = self.get_mu_df()

        na = pd.DataFrame()
        for exp in ['t0', 't1', 't5', 't10', 't20', 't45']:
            sum_na = pd.isnull(mu[[x for x in mu.columns if x.endswith(exp)]]).sum(axis=1)
            na[exp] = sum_na
        return na

    #def is_na_count_exceeds_minnonzero_in_one_experiment(self, min_non_zero=4):
    #    na = self.get_na_per_experiment_after_averaging()
    #    return na.max(axis=1) >= min_non_zero 
Example 41
Project: msdas   Author: cokelaer   File: test_readers.py   License: GNU General Public License v3.0
def test_Cleaner():
    c = Cleaner()
    c.df = pd.DataFrame({'A': [0, 1] , 'B':[4,5]})
    assert pd.isnull(c.df).sum().sum() == 0
    c.set_zero_to_na()
    assert pd.isnull(c.df).sum().sum() == 1 
Example 42
Project: FX-RER-Value-Extraction   Author: tsKenneth   File: generic.py   License: MIT License
def isnull(self):
        return isna(self).__finalize__(self) 
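This pandas internal shows isnull simply delegating to isna; at module level the two names are bound to the same function in current pandas versions:

import pandas as pd

print(pd.isnull is pd.isna)  # True: isnull is an alias for isna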
Example 43
Project: QuantStudio   Author: Scorpi000   File: ReturnBasedModel.py   License: GNU General Public License v3.0
def __QS_move__(self, idt, **kwargs):
        if self._iDT==idt: return 0
        self._iDT = idt
        TargetNAV = self._TargetTable.readData(dts=[idt], ids=self._Output["目标ID"], factor_names=[self.TargetNAV]).iloc[0, :, :].values
        self._Output["目标净值"] = np.r_[self._Output["目标净值"], TargetNAV]
        StyleNAV = self._StyleTable.readData(dts=[idt], ids=self._Output["风格ID"], factor_names=[self.StyleNAV]).iloc[0, :, :].values
        self._Output["风格指数净值"] = np.r_[self._Output["风格指数净值"], StyleNAV]
        if self.CalcDTs:
            if idt not in self.CalcDTs[self._CurCalcInd:]: return 0
            self._CurCalcInd = self.CalcDTs[self._CurCalcInd:].index(idt) + self._CurCalcInd
        else:
            self._CurCalcInd = self._Model.DateTimeIndex
        if self._Output["目标净值"].shape[0]-1<self.MinSummaryWindow: return 0
        StartInd = int(max(0, self._Output["目标净值"].shape[0] - 1 - self.SummaryWindow))
        X = _calcReturn(self._Output["风格指数净值"][StartInd:, :], return_type=self.ReturnType)
        Y = _calcReturn(self._Output["目标净值"][StartInd:, :], return_type=self.ReturnType)
        nTargetID, nStyleID = len(self._Output["目标ID"]), len(self._Output["风格ID"])
        Rsquared = np.full((nTargetID, ), np.nan)
        for i, iID in enumerate(self._Output["目标ID"]):
            iMask = ((np.sum(pd.isnull(X), axis=1)==0) & (pd.notnull(Y[:, i])))
            try:
                iBeta = regressByCVX(Y[:, i], X, weight=None,
                                     constraints={"Box": {"ub": np.ones((nStyleID, )), "lb": np.zeros((nStyleID, ))},
                                                  "LinearEq": {"Aeq": np.ones((1, nStyleID)), "beq": 1}})
            except:
                iBeta = None
            if iBeta is None:
                self._Output["滚动回归系数"][iID].append(np.full((nStyleID, ), np.nan))
            else:
                self._Output["滚动回归系数"][iID].append(iBeta)
                Rsquared[i] = 1 - np.nansum((Y[:, i][iMask] - np.dot(X[iMask], iBeta))**2) / np.nansum((Y[:, i][iMask] - np.nanmean(Y[:, i][iMask]))**2)
        self._Output["滚动回归R平方"].append(Rsquared)
        self._Output["时点"].append(idt)
        return 0 
Example 44
Project: QuantStudio   Author: Scorpi000   File: ReturnBasedModel.py   License: GNU General Public License v3.0
def __QS_end__(self):
        if not self._isStarted: return 0
        super().__QS_end__()
        DTs, StyleIDs, TargetIDs = self._Output.pop("时点"), self._Output.pop("风格ID"), self._Output.pop("目标ID")
        nTargetID, nStyleID = len(TargetIDs), len(StyleIDs)
        X = _calcReturn(self._Output["风格指数净值"], return_type=self.ReturnType)
        Y = _calcReturn(self._Output["目标净值"], return_type=self.ReturnType)
        self._Output["全样本回归系数"] = np.full(shape=(nStyleID, nTargetID), fill_value=np.nan)
        self._Output["全样本回归R平方"] = np.full(shape=(nTargetID, ), fill_value=np.nan)
        for i, iID in enumerate(TargetIDs):
            iMask = ((np.sum(pd.isnull(X), axis=1)==0) & (pd.notnull(Y[:, i])))
            try:
                iBeta = regressByCVX(Y[:, i], X, weight=None,
                                     constraints={"Box": {"ub": np.ones((nStyleID, )), "lb": np.zeros((nStyleID, ))},
                                                  "LinearEq": {"Aeq": np.ones((1, nStyleID)), "beq": 1}})
            except:
                iBeta = None
            if iBeta is not None:
                self._Output["全样本回归系数"][:, i] = iBeta
                self._Output["全样本回归R平方"][i] = 1 - np.nansum((Y[:, i][iMask] - np.dot(X[iMask], iBeta))**2) / np.nansum((Y[:, i][iMask] - np.nanmean(Y[:, i][iMask]))**2)
            self._Output["滚动回归系数"][iID] = pd.DataFrame(self._Output["滚动回归系数"][iID], index=DTs, columns=self.StyleIDs)
        self._Output["全样本回归系数"] = pd.DataFrame(self._Output["全样本回归系数"], index=StyleIDs, columns=TargetIDs)
        self._Output["全样本回归R平方"] = pd.DataFrame(self._Output["全样本回归R平方"], index=TargetIDs, columns=["全样本回归R平方"])
        self._Output["滚动回归R平方"] = pd.DataFrame(self._Output["滚动回归R平方"], index=DTs, columns=TargetIDs)
        self._Output["目标净值"] = pd.DataFrame(self._Output["目标净值"], index=self._Model.DateTimeSeries, columns=self.TargetIDs)
        self._Output["风格指数净值"] = pd.DataFrame(self._Output["风格指数净值"], index=self._Model.DateTimeSeries, columns=self.StyleIDs)
        return 0 
Example 45
Project: QuantStudio   Author: Scorpi000   File: BrinsonModel.py   License: GNU General Public License v3.0
def __QS_move__(self, idt, **kwargs):
        if self._iDT==idt: return 0
        self._iDT = idt
        PreDT = None
        if self.CalcDTs:
            if idt not in self.CalcDTs[self._CurCalcInd:]: return 0
            self._CurCalcInd = self.CalcDTs[self._CurCalcInd:].index(idt) + self._CurCalcInd
            if self._CurCalcInd>0: PreDT = self.CalcDTs[self._CurCalcInd - 1]
        else:
            self._CurCalcInd = self._Model.DateTimeIndex
            if self._CurCalcInd>0: PreDT = self._Model.DateTimeSeries[self._CurCalcInd - 1]
        if PreDT is None: return 0
        Portfolio = self._FactorTable.readData(factor_names=[self.Portfolio, self.BenchmarkPortfolio], dts=[PreDT], ids=self._IDs).iloc[:, 0, :]
        BenchmarkPortfolio, Portfolio = Portfolio.iloc[:, 1], Portfolio.iloc[:, 0]
        Portfolio[pd.isnull(Portfolio)], BenchmarkPortfolio[pd.isnull(BenchmarkPortfolio)] = 0.0, 0.0
        Price = self._FactorTable.readData(factor_names=[self.PriceFactor], dts=[PreDT, idt], ids=self._IDs).iloc[0]
        Return = Price.iloc[1] / Price.iloc[0] - 1
        Return[pd.isnull(Return)] = 0.0
        GroupData = self._FactorTable.readData(factor_names=[self.GroupFactor], ids=self._IDs, dts=[PreDT]).iloc[0, 0, :]
        AllGroups = pd.unique(GroupData[pd.notnull(GroupData)].values).tolist()
        if GroupData.hasnans: AllGroups.append(None)
        for iGroup in AllGroups:
            if iGroup is None: iMask = pd.isnull(GroupData)
            else: iMask = (GroupData==iGroup)
            iGroup = str(iGroup)
            iPortfolio, iBenchmarkPortfolio = Portfolio[iMask], BenchmarkPortfolio[iMask]
            iGroupWeight, iBenchmarkGroupWeight = iPortfolio.sum(), iBenchmarkPortfolio.sum()
            self._Output["策略组合资产权重"].loc[idt, iGroup] = iGroupWeight
            self._Output["基准组合资产权重"].loc[idt, iGroup] = iBenchmarkGroupWeight
            self._Output["策略组合资产收益"].loc[idt, iGroup] = ((iPortfolio * Return[iMask]).sum() / iGroupWeight if iGroupWeight!=0 else 0.0)
            self._Output["基准组合资产收益"].loc[idt, iGroup] = ((iBenchmarkPortfolio * Return[iMask]).sum() / iBenchmarkGroupWeight if iBenchmarkGroupWeight!=0 else 0.0)
        self._Output["策略组合资产权重"].loc[idt, "现金"] = 1 - self._Output["策略组合资产权重"].loc[idt].iloc[1:].sum()
        self._Output["基准组合资产权重"].loc[idt, "现金"] = 1 - self._Output["基准组合资产权重"].loc[idt].iloc[1:].sum()
        return 0 
Example 46
Project: QuantStudio   Author: Scorpi000   File: Distribution.py   License: GNU General Public License v3.0
def __QS_start__(self, mdl, dts, **kwargs):
        if self._isStarted: return ()
        super().__QS_start__(mdl=mdl, dts=dts, **kwargs)
        AllIndustries = pd.unique(self._FactorTable.readData(factor_names=[self.IndustryFactor], dts=self._FactorTable.getDateTime(ifactor_name=self.IndustryFactor), ids=self._FactorTable.getID(ifactor_name=self.IndustryFactor)).iloc[0].values.flatten())
        Mask = pd.isnull(AllIndustries)
        if np.sum(Mask)>0: AllIndustries = AllIndustries[~Mask].tolist()+[None]
        self._Output = {iFactorName:{iIndustry:[] for iIndustry in AllIndustries} for iFactorName in self.TestFactors}
        self._Output["历史平均值"] = {iFactorName:[] for iFactorName in self.TestFactors}
        self._Output["历史标准差"] = {iFactorName:[] for iFactorName in self.TestFactors}
        self._Output["行业分类"] = AllIndustries
        self._Output["时点"] = []
        self._CurCalcInd = 0
        return (self._FactorTable, ) 
Example 47
Project: QuantStudio   Author: Scorpi000   File: Distribution.py   License: GNU General Public License v3.0
def __QS_move__(self, idt, **kwargs):
        if self._iDT==idt: return 0
        self._iDT = idt
        if self.CalcDTs:
            if idt not in self.CalcDTs[self._CurCalcInd:]: return 0
            self._CurCalcInd = self.CalcDTs[self._CurCalcInd:].index(idt) + self._CurCalcInd
        else:
            self._CurCalcInd = self._Model.DateTimeIndex
        IDs = self._FactorTable.getFilteredID(idt=idt, id_filter_str=self.IDFilter)
        FactorExpose = self._FactorTable.readData(dts=[idt], ids=IDs, factor_names=list(self.TestFactors)+[self.IndustryFactor]).iloc[:,0,:]
        IndustryData, FactorExpose = FactorExpose.iloc[:, -1], FactorExpose.iloc[:, :-1].astype("float")
        Threshold = {}
        Mask = {}
        for iFactorName in self.TestFactors:
            Mask[iFactorName] = pd.notnull(FactorExpose[iFactorName])
            if self.Threshold=="中位数":
                Threshold[iFactorName] = FactorExpose[iFactorName].median()
            elif self.Threshold=="平均值":
                Threshold[iFactorName] = FactorExpose[iFactorName].mean()
            elif self.Threshold=="25%分位数":
                Threshold[iFactorName] = FactorExpose[iFactorName].quantile(0.25)
            elif self.Threshold=="75%分位数":
                Threshold[iFactorName] = FactorExpose[iFactorName].quantile(0.75)
        for jIndustry in self._Output["行业分类"]:
            if pd.isnull(jIndustry): jMask = pd.isnull(IndustryData)
            else: jMask = (IndustryData==jIndustry)
            for iFactorName in self.TestFactors:
                ijMask = (jMask & Mask[iFactorName])
                ijNum = ijMask.sum()
                if ijNum!=0: self._Output[iFactorName][jIndustry].append((FactorExpose[iFactorName][ijMask]>=Threshold[iFactorName]).sum()/ijNum)
                else: self._Output[iFactorName][jIndustry].append(np.nan)
        self._Output["时点"].append(idt)
        return 0 
Example 48
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py   License: GNU General Public License v3.0
def standardizeRank(data, mask=None, cat_data=None, ascending=True, uniformization=True, perturbation=False, offset=0.5, other_handle='填充None'):
    """Rank 标准化"""
    if other_handle=="保持不变":
        StdData = np.copy(data)
    else:
        StdData = np.empty(data.shape,dtype='float')+np.nan
    if mask is None:
        mask = pd.isnull(StdData)
    if perturbation:
        UniqueData = data[pd.notnull(data)]
        if UniqueData.shape[0]>0:
            UniqueData = np.sort(pd.unique(UniqueData))
            MinDiff = np.min(np.abs(np.diff(UniqueData)))
            data = data+np.random.rand(data.shape[0])*MinDiff*0.01
    CatMasks = maskCategary(data.shape[0],cat_data=cat_data,mask=mask)
    for jCat,jCatMask in CatMasks.items():
        jData = data[jCatMask]
        jNotNaMask = pd.notnull(jData)
        if ascending:
            jRank = np.argsort(np.argsort(jData[jNotNaMask]))
        else:
            jRank = np.argsort(np.argsort(-jData[jNotNaMask]))
        if uniformization:
            jRank = (jRank.astype('float')+offset)/jRank.shape[0]
        else:
            jRank = jRank.astype('float')
        jData[jNotNaMask] = jRank
        StdData[jCatMask] = jData
    return StdData

# Quantile Transformation standardization
# data: data to standardize, array; cat_data: category data, array
# ascending: whether to rank in ascending order, options: True, False
Example 49
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py   License: GNU General Public License v3.0
def standardizeDynamicPeer(data, corr_matrix, mask=None, cat_data=None, n_group=10, other_handle='填充None'):
    """动态分组标准化"""
    if mask is None:
        mask = (np.zeros(data.shape)==0)
    if other_handle=="保持不变":
        StdData = np.copy(data)
    else:
        StdData = np.empty(data.shape,dtype='float')+np.nan
    for j in range(data.shape[0]):
        if not mask[j]:
            continue
        jPeerCorr = corr_matrix[j,:]
        jNum = min((n_group,np.sum(jPeerCorr>0.0)))
        jData = None
        if jNum>=2:
            jPeerInds = np.argsort(-jPeerCorr)[:jNum]
            jData = data[jPeerInds]
            if np.sum(pd.notnull(jData))<2:
                jData = None
        if jData is None:
            if cat_data is not None:
                jCat = cat_data[j]
                if pd.notnull(jCat):
                    jCatMask = (cat_data==jCat)
                else:
                    jCatMask = pd.isnull(cat_data)
                jData = data[jCatMask]
            else:
                jData = data
        jStd = np.nanstd(jData)
        jAvg = np.nanmean(jData)
        if jStd==0:
            StdData[j] = 0.0
        else:
            StdData[j] = (data[j]-jAvg)/jStd
    return StdData

# Fill missing values with earlier values
# data: data to fill, array; dts: time series, array; lookback: if the time series dts is given, the lookback window in seconds; otherwise the number of lookback periods
Example 50
Project: QuantStudio   Author: Scorpi000   File: DataPreprocessingFun.py   License: GNU General Public License v3.0
def fillNaNByVal(data, mask=None, value=0.0):
    StdData = np.copy(data)
    if mask is None:
        StdData[pd.isnull(StdData)] = value
    else:
        StdData[mask & pd.isnull(StdData)] = value
    return StdData
# Fill missing values in the result of some computation