Python pandas.isnull() Examples

The following are 30 code examples of pandas.isnull(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: stata.py    From Computable with MIT License 6 votes vote down vote up
def _write_data_dates(self):
        """Write every value of ``self.datarows``, converting date columns first.

        The type code of a column is the ordinal of its ``self.typlist``
        entry.  Codes up to 244 are treated as strings: values shorter than
        the code go through ``_pad_bytes`` and are emitted with
        ``self._write``.  Any other code is packed with ``struct`` and written
        to ``self._file``, after a null value has been replaced by the
        matching ``MISSING_VALUES`` entry.
        """
        date_columns = self._convert_dates
        rows = self.datarows
        endian = self._byteorder
        struct_codes = self.TYPE_MAP
        missing_codes = self.MISSING_VALUES
        type_chars = self.typlist
        for record in rows:
            for col, value in enumerate(record):
                type_code = ord(type_chars[col])
                # NOTE: a vectorized date conversion exists (see genfromdta
                # for the int -> datetime direction); it would copy the data.
                if col in date_columns:
                    value = _datetime_to_stata_elapsed(value, self.fmtlist[col])
                if type_code > 244:
                    # Numeric column; the null check only matters for floats.
                    if isnull(value):
                        value = missing_codes[type_code]
                    packed = struct.pack(endian + struct_codes[type_code], value)
                    self._file.write(packed)
                    continue
                # String column.
                if len(value) < type_code:
                    value = _pad_bytes(value, type_code)
                self._write(value)
Example #2
Source File: sensitivity_analysis.py    From DETAD with MIT License 6 votes vote down vote up
def compute_mAP_N(result,this_cls_pred,this_cls_gt):
    """Return the normalized average precision, averaged over tIoU thresholds.

    Args:
        result: object exposing ``tiou_thresholds``, ``matched_gt_id_cols``
            (one column name per threshold) and
            ``average_num_instance_per_class``.
        this_cls_pred: table of predictions; for every threshold the column
            named by ``matched_gt_id_cols`` is null where the prediction has
            no matched ground truth (counted as a false positive).
        this_cls_gt: table of ground-truth instances with a ``'gt-id'`` column.

    Returns:
        The mean, over thresholds, of ``interpolated_prec_rec`` applied to
        the cumulative precision and recall curves.
    """
    n_thresholds = len(result.tiou_thresholds)
    n_predictions = len(this_cls_pred)
    ap = np.zeros(n_thresholds)
    tp = np.zeros((n_thresholds, n_predictions))
    fp = np.zeros((n_thresholds, n_predictions))

    for tidx in range(n_thresholds):
        # Compute the "unmatched" mask once and use it for both matrices.
        unmatched = pd.isnull(this_cls_pred[result.matched_gt_id_cols[tidx]]).values
        fp[tidx, unmatched] = 1
        tp[tidx, ~unmatched] = 1

    # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is the
    # dtype it used to alias.
    tp_cumsum = np.cumsum(tp, axis=1).astype(float)
    fp_cumsum = np.cumsum(fp, axis=1).astype(float)
    recall_cumsum = tp_cumsum / len(np.unique(this_cls_gt['gt-id']))
    # Precision with the positives rescaled by the average number of
    # instances per class.
    scaled_recall = recall_cumsum * result.average_num_instance_per_class
    precision_cumsum = scaled_recall / (scaled_recall + fp_cumsum)

    for tidx in range(n_thresholds):
        ap[tidx] = interpolated_prec_rec(precision_cumsum[tidx,:], recall_cumsum[tidx,:])

    return ap.mean()

# Initialize true positive and false positive vectors. 
Example #3
Source File: common.py    From naru with Apache License 2.0 6 votes vote down vote up
def SetDistribution(self, distinct_values):
        """Record the sorted set of all values this column will ever see.

        A null value (NaN or NaT), when present, is always stored first.
        Returns ``self``.
        """
        assert self.all_distinct_values is None
        # pd.isnull flags both np.nan and np.datetime64('NaT').
        null_mask = pd.isnull(distinct_values)
        has_null = np.any(null_mask)
        # np.sort would place NaT first but NaN last, so nulls are removed
        # before sorting and a single null marker is re-inserted up front.
        ordered = np.sort(np.unique(distinct_values[~null_mask]))
        if has_null:
            if np.issubdtype(distinct_values.dtype, np.datetime64):
                null_marker = np.datetime64('NaT')
            else:
                null_marker = np.nan
            ordered = np.insert(ordered, 0, null_marker)
        if self.distribution_size is not None:
            assert len(ordered) == self.distribution_size
        self.all_distinct_values = ordered
        self.distribution_size = len(ordered)
        return self
Example #4
Source File: compare.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _compute_frequency(self, col):
        """Return, for each entry of ``col``, how often its value occurs.

        Counts are floats, divided by ``len(col)`` when ``self.normalise``
        is set.  Entries that are null in ``col`` receive
        ``self.missing_value`` instead of a count.
        """
        # Nulls are swapped for a placeholder before grouping, see
        # https://github.com/pydata/pandas/issues/3729
        placeholder = 'NAN'
        filled = col.fillna(placeholder)

        counts = filled.groupby(by=filled).transform('count')
        counts = counts.astype(numpy.float64)

        if self.normalise:
            counts = counts / len(col)

        # Overwrite the placeholder group's count with the missing value.
        was_null = col.isnull()
        counts[was_null] = self.missing_value

        return counts
Example #5
Source File: string.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def jarowinkler_similarity(s1, s2):
    """Score each aligned pair of ``s1`` and ``s2`` with jellyfish's Jaro-Winkler.

    Returns a Series with one score per pair.  When scoring a pair raises
    and either member is null, the score is NaN; any other error propagates.
    """
    pairs = pandas.Series(list(zip(s1, s2)))

    from jellyfish import jaro_winkler

    def _score(pair):
        left, right = pair[0], pair[1]
        try:
            return jaro_winkler(left, right)
        except Exception:
            # Only a failure caused by missing data is tolerated.
            if pandas.isnull(left) or pandas.isnull(right):
                return np.nan
            raise

    return pairs.apply(_score)
Example #6
Source File: string.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def levenshtein_similarity(s1, s2):
    """Score each aligned pair of ``s1`` and ``s2`` as 1 - distance / longest length.

    The distance is jellyfish's Levenshtein distance.  Returns a Series
    with one score per pair.  When scoring a pair raises and either member
    is null, the score is NaN; any other error propagates.
    """
    pairs = pandas.Series(list(zip(s1, s2)))

    from jellyfish import levenshtein_distance

    def _score(pair):
        left, right = pair[0], pair[1]
        try:
            distance = levenshtein_distance(left, right)
            longest = np.max([len(left), len(right)])
            return 1 - distance / longest
        except Exception:
            # Only a failure caused by missing data is tolerated.
            if pandas.isnull(left) or pandas.isnull(right):
                return np.nan
            raise

    return pairs.apply(_score)
Example #7
Source File: compare.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _compute_vectorized(self, s_left, s_right):
        """Compare two Series element-wise and return the comparison Series.

        With ``self.agree_value == 'value'`` the result holds the left value
        where both sides agree; otherwise it holds ``self.agree_value``.
        Disagreeing entries hold ``self.disagree_value``.  Entries where
        either side is null hold ``self.missing_value``, unless that equals
        the disagree value.
        """
        if self.agree_value == 'value':
            # Start from the left values and blank out the disagreements.
            outcome = s_left.copy()
            differs = s_left != s_right
            outcome[differs] = self.disagree_value
        else:
            # Start from "disagree" everywhere and mark the agreements.
            outcome = pandas.Series(self.disagree_value, index=s_left.index)
            matches = s_left == s_right
            outcome[matches] = self.agree_value

        # Nulls already compare as disagreements, so nothing needs doing when
        # the two markers are identical.
        if self.disagree_value != self.missing_value:
            either_null = s_left.isnull() | s_right.isnull()
            outcome[either_null] = self.missing_value

        return outcome
Example #8
Source File: string.py    From recordlinkage with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def damerau_levenshtein_similarity(s1, s2):
    """Score each aligned pair of ``s1`` and ``s2`` as 1 - distance / longest length.

    The distance is jellyfish's Damerau-Levenshtein distance.  Returns a
    Series with one score per pair.  When scoring a pair raises and either
    member is null, the score is NaN; any other error propagates.
    """
    pairs = pandas.Series(list(zip(s1, s2)))

    from jellyfish import damerau_levenshtein_distance

    def _score(pair):
        left, right = pair[0], pair[1]
        try:
            distance = damerau_levenshtein_distance(left, right)
            longest = np.max([len(left), len(right)])
            return 1 - distance / longest
        except Exception:
            # Only a failure caused by missing data is tolerated.
            if pandas.isnull(left) or pandas.isnull(right):
                return np.nan
            raise

    return pairs.apply(_score)
Example #9
Source File: type_detection.py    From sato with Apache License 2.0 6 votes vote down vote up
def _is_locale_integer(e):
    """Return True if ``locale.atoi`` parses ``e`` as an integer under any of ``locales``."""
    # setlocale is process-wide: remember the current setting and restore it.
    saved = locale.setlocale(locale.LC_ALL)
    try:
        for l in locales:
            try:
                # The constant is LC_ALL; the former ``LC_all`` raised
                # AttributeError, which silently disabled this fallback.
                locale.setlocale(locale.LC_ALL, l)
                if float(locale.atoi(e)).is_integer():
                    return True
            except Exception:
                # Unavailable locale or unparsable value: try the next one.
                continue
        return False
    finally:
        locale.setlocale(locale.LC_ALL, saved)


def detect_integer(e):
    """Return True when ``e`` looks like an integer, False otherwise.

    Empty strings and nulls are never integers.  A value matched by
    ``integer_regex`` is an integer.  Only when applying the regex raises
    (e.g. ``e`` is not a string) is ``float(e)`` tried, and only when that
    raises too is ``e`` parsed under each entry of ``locales``.  The
    function itself never raises for these fallbacks.

    NOTE(review): a string the regex does not match returns False without
    reaching the float/locale fallbacks; kept as-is, confirm it is intended.
    """
    if e == '' or pd.isnull(e):
        return False

    try:
        if integer_regex.match(e):
            return True
    except Exception:
        try:
            if float(e).is_integer():
                return True
        except Exception:
            try:
                return _is_locale_integer(e)
            except Exception:
                # Best effort: a failing locale probe means "not an integer".
                pass
    return False
Example #10
Source File: type_detection.py    From sato with Apache License 2.0 6 votes vote down vote up
def _is_locale_decimal(e):
    """Return True if ``locale.atof`` parses ``e`` under any entry of ``locales``."""
    # setlocale is process-wide: remember the current setting and restore it.
    saved = locale.setlocale(locale.LC_ALL)
    try:
        for l in locales:
            try:
                # The constant is LC_ALL; the former ``LC_all`` raised
                # AttributeError, which silently disabled this fallback.
                locale.setlocale(locale.LC_ALL, l)
                locale.atof(e)
            except Exception:
                # Unavailable locale or unparsable value: try the next one.
                continue
            return True
        return False
    finally:
        locale.setlocale(locale.LC_ALL, saved)


def detect_decimal(e):
    """Return True when ``e`` looks like a decimal number, False otherwise.

    Empty strings and nulls are never decimals.  A value is accepted when
    ``decimal_regex`` matches it, when ``Decimal(e)`` succeeds, or when
    ``locale.atof`` parses it under one of ``locales``.

    The locale fallback used to retry ``Decimal(e)``, which had already
    failed, and would have returned a Decimal rather than a bool; it now
    reports whether ``locale.atof`` succeeded.
    """
    if e == '' or pd.isnull(e):
        return False

    if decimal_regex.match(e):
        return True
    try:
        Decimal(e)
        return True
    except Exception:
        try:
            return _is_locale_decimal(e)
        except Exception:
            # Best effort: a failing locale probe means "not a decimal".
            pass
    return False
Example #11
Source File: test_integer.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_conversions(data_missing):
    """Casting ``data_missing`` to object must yield ``[nan, 1]`` with exact scalar types."""
    frame = pd.DataFrame({'A': data_missing})

    # astype to an object Series
    as_series = frame['A'].astype('object')
    wanted_series = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(as_series, wanted_series)

    # Conversion to an object ndarray must be exactly equal, including the
    # type conversions of the scalars.
    as_array = frame['A'].astype('object').values
    wanted_array = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(as_array, wanted_array)

    for got, want in zip(as_array, wanted_array):
        if pd.isnull(got):
            assert pd.isnull(want)
            continue
        assert got == want
        if is_integer(got):
            # PY2 can be int or long
            assert is_integer(want)
        else:
            assert type(got) == type(want)
Example #12
Source File: __init__.py    From psst with MIT License 6 votes vote down vote up
def solve(self, solver='glpk', verbose=False, keepfiles=False, resolve=False, **kwargs):
        """Solve the model and store the results on ``self``.

        When ``resolve`` is true (always the case for the ``'xpress'``
        solver), every non-null unit-commitment value of the first solution
        is fixed on ``self._model.UnitOn`` and the model is solved a second
        time with ``is_mip=False``.

        Args:
            solver: solver name passed to ``solve_model``.
            verbose: passed to ``solve_model``.
            keepfiles: passed to ``solve_model``.
            resolve: whether to run the second solve with commitments fixed.
            **kwargs: passed through to ``solve_model``.
        """
        if solver == 'xpress':
            resolve = True

        solve_model(self._model, solver=solver, verbose=verbose, keepfiles=keepfiles, **kwargs)
        self._results = PSSTResults(self)

        if resolve:
            for t, row in self.results.unit_commitment.iterrows():
                # Series.iteritems() was removed in pandas 2.0; items() is
                # the equivalent iterator.
                for g, v in row.items():
                    if not pd.isnull(v):
                        self._model.UnitOn[g, t].fixed = True
                        self._model.UnitOn[g, t] = int(float(v))

            solve_model(self._model, solver=solver, verbose=verbose, keepfiles=keepfiles, is_mip=False, **kwargs)
            self._results = PSSTResults(self)

        self._status = 'solved'
Example #13
Source File: test_validator_database_random_entries.py    From cellphonedb with MIT License 6 votes vote down vote up
def test_gene(self):
        """Check that every entry of ``gene_entries`` has a matching database row.

        For each expected gene the expanded gene table is filtered column by
        column; an expected value of ``None`` matches null cells.  Entries
        without any matching row are logged, and the test fails at the end
        if there was at least one.
        """
        dataframe = cellphonedb_app.cellphonedb.database_manager.get_repository(
            'gene').get_all_expanded()

        data_not_match = False

        for gene in gene_entries:
            db_gene = dataframe

            for column_name in gene:
                # Identity check: ``== None`` would go through the value's __eq__.
                if gene[column_name] is None:
                    db_gene = db_gene[pd.isnull(db_gene[column_name])]
                else:
                    db_gene = db_gene[db_gene[column_name] == gene[column_name]]

            if len(db_gene) < 1:
                app_logger.warning('Failed cheking Gene:')
                app_logger.warning('Expected data:')
                app_logger.warning(gene)
                data_not_match = True

        self.assertFalse(data_not_match, 'Some Gene doesnt match')
Example #14
Source File: recmat.py    From quail with MIT License 6 votes vote down vote up
def _recmat_exact(presented, recalled, features):
    """Build the recall matrix using exact feature matches.

    For every list (row label of ``presented``) and every feature, each
    recalled item is mapped to the 1-based position of the first presented
    item whose feature value equals it, or NaN when there is none.

    Args:
        presented: DataFrame of presented items, one row per list; each cell
            is indexed with ``'item'`` and with the feature names.
        recalled: DataFrame of recalled items with the same row labels.
        features: iterable of feature names to match on.

    Returns:
        Float array with one row per list and as many columns as the wider
        of the two frames, NaN where nothing was recalled or matched.  The
        row is rewritten for each feature, so it reflects the last one.
    """
    # ``.get_values()`` was removed in pandas 1.0; ``.values`` is equivalent here.
    lists = presented.index.values
    cols = max(presented.shape[1], recalled.shape[1])
    result = np.full((presented.shape[0], cols), np.nan)
    for li, l in enumerate(lists):
        p_list = presented.loc[l]
        r_list = recalled.loc[l]
        for feature in features:
            def get_feature(x, feature=feature):
                # NaN stands in for cells whose item is null.
                if np.array(pd.isnull(x['item'])).any():
                    return np.nan
                return np.array(x[feature])

            p = np.vstack(p_list.apply(get_feature).values)
            r = r_list.dropna().apply(get_feature).values
            r = np.vstack(list(filter(lambda x: x is not np.nan, r)))
            try:
                m = [np.where((p==x).all(axis=1))[0] for x in r]
            except AttributeError:
                m = []
            result[li, :len(m)] = [x[0]+1 if len(x)>0 else np.nan for x in m]
    return result
Example #15
Source File: test_core.py    From ffn with MIT License 6 votes vote down vote up
def test_calc_stats():
    """Check the stats that ``ffn.calc_stats`` reports as null on divide-by-zero."""
    win_perc = 'twelve_month_win_perc'

    # twelve_month_win_perc divide by zero: null on the shorter price
    # window, defined on the longer one.
    short_window = df.C['2010-10-01':'2011-08-01']
    assert pd.isnull(ffn.calc_stats(short_window).stats[win_perc])
    long_window = df.C['2009-10-01':'2011-08-01']
    assert not pd.isnull(ffn.calc_stats(long_window).stats[win_perc])

    # yearly_sharpe divide by zero
    flat_prices = df.C['2009-01-01':'2012-01-01']
    assert 'yearly_sharpe' in ffn.calc_stats(flat_prices).stats.index

    # Setting every positive price to the same value makes calc_stats emit
    # warnings and report a null yearly_sharpe.
    flat_prices[flat_prices > 0.0] = 1.0
    assert pd.isnull(ffn.calc_stats(flat_prices).stats['yearly_sharpe'])
Example #16
Source File: test_foreign.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_missing_roundtrip():
    """Missing values written by StataWriter must read back as missing."""
    stream = BytesIO()
    record_dtype = [("double_miss", float), ("float_miss", np.float32),
                    ("string_miss", "a1")]
    original = np.array([(np.nan, np.inf, "")], dtype=record_dtype)
    StataWriter(stream, original).write_file()
    stream.seek(0)

    # Both float columns come back null, the string column comes back empty.
    restored = genfromdta(stream, missing_flt=np.nan)
    first_row = restored[0]
    assert_(isnull(first_row[0]))
    assert_(isnull(first_row[1]))
    assert_(first_row[2] == asbytes(""))

    # In the stored results file the first five fields of the first row are
    # all read back as the requested missing marker.
    results_path = os.path.join(curdir, "results/data_missing.dta")
    from_file = genfromdta(results_path, missing_flt=-999)
    assert_(np.all([from_file[0][i] == -999 for i in range(5)]))
Example #17
Source File: pdutils.py    From meterstick with Apache License 2.0 6 votes vote down vote up
def any_null(obj):
  """Tells whether obj contains at least one null value.

  Args:
    obj: A scalar, Series, or DataFrame.

  Returns:
    A boolean. True if obj is, or contains, a null value.

  Raises:
    ValueError: if obj is not a scalar, Series, or DataFrame
  """
  if np.isscalar(obj):
    return pd.isnull(obj)
  if isinstance(obj, pd.Series):
    null_flags = obj.isnull()
    return null_flags.any()
  if isinstance(obj, pd.DataFrame):
    null_flags = obj.isnull().values
    return null_flags.any()
  raise ValueError("obj is not a scalar, Series, or DataFrame.")
Example #18
Source File: test_resample.py    From Computable with MIT License 6 votes vote down vote up
def test_ohlc_5min(self):
        """Right-closed, right-labelled 5-minute OHLC resampling of a 10-second series."""
        def _ohlc(window):
            # Expected [open, high, low, close] of a window; all-NaN when
            # the window holds no data.
            if isnull(window).all():
                return np.repeat(np.nan, 4)
            first, last = window[0], window[-1]
            return [first, window.max(), window.min(), last]

        index = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50',
                           freq='10s')
        series = Series(np.random.randn(len(index)), index=index)

        bars = series.resample('5min', how='ohlc', closed='right',
                               label='right')

        # The first bin only holds the very first observation.
        self.assert_((bars.ix['1/1/2000 00:00'] == series[0]).all())

        expected = _ohlc(series[1:31])
        self.assert_((bars.ix['1/1/2000 00:05'] == expected).all())

        expected = _ohlc(series['1/1/2000 5:55:01':])
        self.assert_((bars.ix['1/1/2000 6:00:00'] == expected).all())
Example #19
Source File: generic.py    From Computable with MIT License 6 votes vote down vote up
def clip_lower(self, threshold):
        """
        Return a copy in which values below ``threshold`` are replaced by it

        Null entries are left untouched.

        See also
        --------
        clip

        Returns
        -------
        clipped : same type as input

        Raises
        ------
        ValueError
            If ``threshold`` is an NA value.
        """
        if isnull(threshold):
            raise ValueError("Cannot use an NA value as a clip threshold")

        # Keep entries at or above the threshold, and nulls; everything else
        # is replaced by the threshold.
        keep = (self >= threshold) | isnull(self)
        return self.where(keep, threshold)
Example #20
Source File: transform.py    From skutil with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _mode(x, def_fill=ImputerMixin._def_fill):
    """Get the most common value in a 1d
    H2OFrame. Ties will be handled in a non-specified
    manner.

    Parameters
    ----------

    x : ``H2OFrame``, shape=(n_samples, 1)
        The 1d frame from which to derive the mode

    def_fill : optional
        The value returned when the frame holds no non-null value.

    Returns
    -------

    The most frequent non-null value of the first column, or ``def_fill``.
    """
    idx = x.as_data_frame(use_pandas=True)[x.columns[0]].value_counts().index

    # value_counts orders values by frequency. Nulls are skipped so that a
    # null mode gives way to the next most common value. An empty index
    # (e.g. a 100% null column, since value_counts drops nulls by default)
    # used to raise IndexError and now yields def_fill.
    non_null = idx[~pd.isnull(idx)]
    return non_null[0] if len(non_null) > 0 else def_fill
Example #21
Source File: word2vec.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def replace_nan(s):
        """Return an empty string when ``s`` is null, otherwise ``s`` unchanged."""
        return "" if pd.isnull(s) else s
Example #22
Source File: grams_and_terms_features.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def replace_nan(s):
        """Map a null ``s`` to the empty string; pass any other value through."""
        if pd.isnull(s):
                return ""
        return s
#code for attributes creation 
Example #23
Source File: json.py    From Computable with MIT License 5 votes vote down vote up
def _try_convert_to_date(self, data):
        """ try to parse a ndarray like into a date column

            object data is first coerced to int64 when possible; numeric
            data is only converted when every value is null, iNaT or above
            ``self.min_stamp``; each candidate date unit is then tried in
            turn.

            return a ``(values, converted)`` pair: the parsed values and
            True on success, the untouched input and False otherwise """

        # no conversion on empty
        if not len(data):
            return data, False

        candidate = data
        if candidate.dtype == 'object':
            try:
                candidate = data.astype('int64')
            except:
                pass

        # ignore numbers that are out of range
        if issubclass(candidate.dtype.type, np.number):
            is_missing = isnull(candidate.values)
            above_min = candidate > self.min_stamp
            is_nat = candidate.values == iNaT
            in_range = (is_missing | above_min | is_nat)
            if not in_range.all():
                return data, False

        if self.date_unit:
            units_to_try = (self.date_unit,)
        else:
            units_to_try = self._STAMP_UNITS
        for unit in units_to_try:
            try:
                converted = to_datetime(candidate, errors='raise',
                                        unit=unit)
            except OverflowError:
                # this unit overflows; try the next one
                continue
            except:
                # any other failure ends the attempt
                break
            else:
                return converted, True
        return data, False
Example #24
Source File: foreign.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def _write_data_dates(self):
        """Write every value of ``self.datarows``, converting date columns first.

        The type code of a column is the ordinal of its ``self.typlist``
        entry.  Codes up to 244 are treated as strings: a null becomes the
        empty string and values shorter than the code go through
        ``_pad_bytes``.  Any other code is packed with ``pack`` after a null
        has been replaced by the matching ``MISSING_VALUES`` entry.
        Everything is emitted through ``self._write``.
        """
        date_columns = self._convert_dates
        rows = self.datarows
        endian = self._byteorder
        struct_codes = self.TYPE_MAP
        missing_codes = self.MISSING_VALUES
        type_chars = self.typlist
        for record in rows:
            for col, value in enumerate(record):
                type_code = ord(type_chars[col])
                # NOTE: a vectorized date conversion exists (see genfromdta
                # for the int -> datetime direction); it would copy the data.
                if col in date_columns:
                    value = _datetime_to_stata_elapsed(value, self.fmtlist[col])
                if type_code > 244:
                    # Numeric column; the null check only matters for floats.
                    if isnull(value):
                        value = missing_codes[type_code]
                    self._write(pack(endian + struct_codes[type_code], value))
                    continue
                # String column.
                if isnull(value):
                    value = "" # missing string
                if len(value) < type_code:
                    value = _pad_bytes(value, len(value) + 1)
                self._write(value)
Example #25
Source File: Fetch_Data_Stock_US_Daily.py    From StockRecommendSystem with MIT License 5 votes vote down vote up
def judgeNeedPreDownload(root_path, symbol, first_date, from_date, to_date):
    """Return True when data for ``symbol`` must be downloaded first.

    Nothing is needed when the stored publish day equals ``first_date``.
    Otherwise a download is needed when the range ``from_date``..``to_date``
    has open days and the publish day is unknown or earlier than the last
    of them.
    """
    publish_day = pd.Timestamp(queryStockPublishDay(root_path, "DB_STOCK", "SHEET_US", symbol))
    publish_unknown = pd.isnull(publish_day)
    if not publish_unknown and publish_day == first_date:
        return False

    open_days = judgeOpenDaysInRange(from_date, to_date)
    if len(open_days) == 0:
        return False

    last_open_day = pd.Timestamp(open_days['date'].index[-1])
    return bool(publish_unknown or last_open_day > publish_day)
Example #26
Source File: grams_and_terms_features.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def replace_nan(s):
        """Swap a null ``s`` for the empty string and return the result."""
        is_missing = pd.isnull(s)
        if is_missing:
                return ""
        return s
Example #27
Source File: grams_and_terms_features.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def replace_nan(s):
        """Return ``s`` as a string, with a null ``s`` mapped to the empty string.

        The null check has to run before the conversion: ``str(s)`` turns
        NaN into the non-null text "nan", which made the former check dead.
        """
        if pd.isnull(s):
                return ""
        return str(s)
Example #28
Source File: indicator.py    From pinkfish with MIT License 5 votes vote down vote up
def apply(self, row):
        """Update the running counter ``self._r`` from ``row`` and return it.

        The counter becomes NaN while ``row['__sma_slow__']`` is null.
        Otherwise it counts consecutive rows on the same side: it grows by
        one (restarting at 1) while the fast value is above the slow one,
        and shrinks by one (restarting at -1) while it is not.
        """
        slow = row['__sma_slow__']
        if pd.isnull(slow):
            counter = np.nan
        elif row['__sma_fast__'] > slow:
            counter = self._r + 1 if self._r > 0 else 1
        else:
            counter = self._r - 1 if self._r < 0 else -1
        self._r = counter
        return self._r
Example #29
Source File: Fetch_Data_Stock_HK_Daily.py    From StockRecommendSystem with MIT License 5 votes vote down vote up
def judgeNeedPreDownload(dir, symbol, from_date, to_date):
    """Return True when data for ``symbol`` must be downloaded first.

    That is the case when the range ``from_date``..``to_date`` has open
    days and the stored publish day is unknown or earlier than the last of
    them.
    """
    open_days = judgeOpenDaysInRange(from_date, to_date)
    if len(open_days) == 0:
        return False

    publish_day = pd.Timestamp(getStockPublishDay(dir, symbol))
    last_open_day = pd.Timestamp(open_days['Date'].index[-1])
    return bool(pd.isnull(publish_day) or last_open_day > publish_day)
Example #30
Source File: generic.py    From Computable with MIT License 5 votes vote down vote up
def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
                   **kwds):
        """
        Percent change over given number of periods

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for forming percent change
        fill_method : str, default 'pad'
            How to handle NAs before computing percent changes;
            None skips the filling step
        limit : int, default None
            The number of consecutive NAs to fill before stopping
        freq : DateOffset, timedelta, or offset alias string, optional
            Increment to use from time series API (e.g. 'M' or BDay())

        Returns
        -------
        chg : same type as caller
        """
        # TODO: Not sure if above is correct - need someone to confirm.
        filled = (self if fill_method is None
                  else self.fillna(method=fill_method, limit=limit))
        shifted = filled.shift(periods=periods, freq=freq, **kwds)
        rs = filled / shifted - 1
        if freq is None:
            # Positions that were null in the caller stay null in the result.
            was_null = com.isnull(_values_from_object(self))
            np.putmask(rs.values, was_null, np.nan)
        return rs