Python pandas.isnull() Examples

The following are 30 code examples that show how to use pandas.isnull(). They are extracted from open source projects; the source project, author, file, and license are noted with each example.

You may also want to check out all available functions and classes of the module pandas.
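
As a quick primer, here is a minimal sketch of the basic behavior, assuming only NumPy and pandas are installed: pd.isnull() is an alias of pd.isna(); it accepts scalars as well as array-likes, and treats None, np.nan, and pd.NaT uniformly.

import numpy as np
import pandas as pd

# Scalars: None, np.nan, and pd.NaT are all treated as null.
assert pd.isnull(None)
assert pd.isnull(np.nan)
assert pd.isnull(pd.NaT)
assert not pd.isnull('text')

# Array-likes: an elementwise boolean mask is returned.
s = pd.Series([1.0, np.nan, 3.0])
print(pd.isnull(s).tolist())  # [False, True, False]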

Example 1
Project: DETAD   Author: HumamAlwassel   File: sensitivity_analysis.py    License: MIT License
def compute_mAP_N(result, this_cls_pred, this_cls_gt):
    ap = np.zeros(len(result.tiou_thresholds))
    # Initialize true positive and false positive vectors.
    tp = np.zeros((len(result.tiou_thresholds), len(this_cls_pred)))
    fp = np.zeros((len(result.tiou_thresholds), len(this_cls_pred)))

    for tidx, tiou in enumerate(result.tiou_thresholds): 
        fp[tidx,pd.isnull(this_cls_pred[result.matched_gt_id_cols[tidx]]).values] = 1
        tp[tidx,~(pd.isnull(this_cls_pred[result.matched_gt_id_cols[tidx]]).values)] = 1

    tp_cumsum = np.cumsum(tp, axis=1).astype(float)
    fp_cumsum = np.cumsum(fp, axis=1).astype(float)
    recall_cumsum = tp_cumsum / len(np.unique(this_cls_gt['gt-id']))
    precision_cumsum = recall_cumsum * result.average_num_instance_per_class / (recall_cumsum * result.average_num_instance_per_class + fp_cumsum)

    for tidx in range(len(result.tiou_thresholds)):
        ap[tidx] = interpolated_prec_rec(precision_cumsum[tidx,:], recall_cumsum[tidx,:])
    
    return ap.mean()

Example 2
Project: naru   Author: naru-project   File: common.py    License: Apache License 2.0
def SetDistribution(self, distinct_values):
        """This is all the values this column will ever see."""
        assert self.all_distinct_values is None
        # pd.isnull returns true for both np.nan and np.datetime64('NaT').
        is_nan = pd.isnull(distinct_values)
        contains_nan = np.any(is_nan)
        dv_no_nan = distinct_values[~is_nan]
        # NOTE: np.sort puts NaT values at beginning, and NaN values at end.
        # For our purposes we always add any null value to the beginning.
        vs = np.sort(np.unique(dv_no_nan))
        if contains_nan and np.issubdtype(distinct_values.dtype, np.datetime64):
            vs = np.insert(vs, 0, np.datetime64('NaT'))
        elif contains_nan:
            vs = np.insert(vs, 0, np.nan)
        if self.distribution_size is not None:
            assert len(vs) == self.distribution_size
        self.all_distinct_values = vs
        self.distribution_size = len(vs)
        return self 
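
The comment in SetDistribution relies on pd.isnull() treating np.nan and np.datetime64('NaT') uniformly; here is a minimal check of that claim. Note that the NOTE about np.sort is version-dependent: since NumPy 1.18, NaT sorts to the end, like NaN.

import numpy as np
import pandas as pd

dates = np.array([np.datetime64('2020-01-01'), np.datetime64('NaT')])
print(pd.isnull(dates))                    # [False  True]
print(pd.isnull(np.array([1.0, np.nan])))  # [False  True]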
Example 3
Project: recordlinkage   Author: J535D165   File: compare.py    License: BSD 3-Clause "New" or "Revised" License
def _compute_vectorized(self, s_left, s_right):

        # Either return the values themselves or agree/disagree indicators.
        if self.agree_value == 'value':
            compare = s_left.copy()
            compare[s_left != s_right] = self.disagree_value

        else:
            compare = pandas.Series(self.disagree_value, index=s_left.index)
            compare[s_left == s_right] = self.agree_value

        # Only needed when the disagree value differs from the missing value.
        if self.disagree_value != self.missing_value:
            compare[(s_left.isnull() | s_right.isnull())] = self.missing_value

        return compare 
Example 4
Project: recordlinkage   Author: J535D165   File: compare.py    License: BSD 3-Clause "New" or "Revised" License
def _compute_frequency(self, col):

        # https://github.com/pydata/pandas/issues/3729
        na_value = 'NAN'
        value_count = col.fillna(na_value)

        c = value_count.groupby(by=value_count).transform('count')
        c = c.astype(numpy.float64)

        if self.normalise:
            c = c / len(col)

        # replace missing values
        c[col.isnull()] = self.missing_value

        return c 
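
A small, self-contained reproduction of the placeholder trick in _compute_frequency; the 'NAN' sentinel is a convention of this example and is only safe when that literal string cannot occur in the data:

import numpy as np
import pandas as pd

col = pd.Series(['a', 'b', 'a', np.nan, 'a'])

# Fill nulls with a placeholder so groupby counts every row, then
# broadcast each group's size back onto the rows it came from.
filled = col.fillna('NAN')
counts = filled.groupby(filled).transform('count').astype(np.float64)

# Restore missingness where the input was null.
counts[col.isnull()] = np.nan
print(counts.tolist())  # [3.0, 1.0, 3.0, nan, 3.0]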
Example 5
Project: recordlinkage   Author: J535D165   File: string.py    License: BSD 3-Clause "New" or "Revised" License
def jarowinkler_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    from jellyfish import jaro_winkler

    def jaro_winkler_apply(x):

        try:
            return jaro_winkler(x[0], x[1])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(jaro_winkler_apply) 
Example 6
Project: recordlinkage   Author: J535D165   File: string.py    License: BSD 3-Clause "New" or "Revised" License
def levenshtein_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    from jellyfish import levenshtein_distance

    def levenshtein_apply(x):

        try:
            return 1 - levenshtein_distance(x[0], x[1]) \
                / np.max([len(x[0]), len(x[1])])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(levenshtein_apply) 
Example 7
Project: recordlinkage   Author: J535D165   File: string.py    License: BSD 3-Clause "New" or "Revised" License
def damerau_levenshtein_similarity(s1, s2):

    conc = pandas.Series(list(zip(s1, s2)))

    from jellyfish import damerau_levenshtein_distance

    def damerau_levenshtein_apply(x):

        try:
            return 1 - damerau_levenshtein_distance(x[0], x[1]) \
                / np.max([len(x[0]), len(x[1])])
        except Exception as err:
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            else:
                raise err

    return conc.apply(damerau_levenshtein_apply) 
Example 8
Project: sato   Author: megagonlabs   File: type_detection.py    License: Apache License 2.0
def detect_integer(e):
    if e == '' or pd.isnull(e): return False

    try:
        if integer_regex.match(e): return True
    except Exception:
        try:
            if float(e).is_integer(): return True
        except Exception:
            try:
                for l in locales:
                    locale.setlocale(locale.LC_ALL, l)
                    if float(locale.atoi(e)).is_integer(): return True
            except Exception:
                pass
    return False 
Example 9
Project: sato   Author: megagonlabs   File: type_detection.py    License: Apache License 2.0
def detect_decimal(e):
    if e == '' or pd.isnull(e): return False

    if decimal_regex.match(e):
        return True
    try:
        Decimal(e)
        return True
    except Exception:
        try:
            for l in locales:
                locale.setlocale(locale.LC_ALL, l)
                value = locale.atof(e)
                if sys.version_info < (2, 7):
                    value = str(value)
                return Decimal(value)
        except Exception:
            pass
    return False 
Example 10
Project: quail   Author: ContextLab   File: recmat.py    License: MIT License
def _recmat_exact(presented, recalled, features):
    lists = presented.index.values
    cols = max(presented.shape[1], recalled.shape[1])
    result = np.empty((presented.shape[0], cols))*np.nan
    for li, l in enumerate(lists):
        p_list = presented.loc[l]
        r_list = recalled.loc[l]
        for i, feature in enumerate(features):
            get_feature = lambda x: np.array(x[feature]) if not np.array(pd.isnull(x['item'])).any() else np.nan
            p = np.vstack(p_list.apply(get_feature).values)
            r = r_list.dropna().apply(get_feature).values
            r = np.vstack(list(filter(lambda x: x is not np.nan, r)))
            try:
                m = [np.where((p==x).all(axis=1))[0] for x in r]
            except AttributeError:
                m = []
            result[li, :len(m)] = [x[0]+1 if len(x)>0 else np.nan for x in m]
    return result 
Example 11
Project: recruit   Author: Frank-qlu   File: test_integer.py    License: Apache License 2.0
def test_conversions(data_missing):

    # astype to object series
    df = pd.DataFrame({'A': data_missing})
    result = df['A'].astype('object')
    expected = pd.Series(np.array([np.nan, 1], dtype=object), name='A')
    tm.assert_series_equal(result, expected)

    # convert to object ndarray
    # we assert that we are exactly equal
    # including type conversions of scalars
    result = df['A'].astype('object').values
    expected = np.array([np.nan, 1], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    for r, e in zip(result, expected):
        if pd.isnull(r):
            assert pd.isnull(e)
        elif is_integer(r):
            # PY2 can be int or long
            assert r == e
            assert is_integer(e)
        else:
            assert r == e
            assert type(r) == type(e) 
Example 12
Project: psst   Author: power-system-simulation-toolbox   File: __init__.py    License: MIT License
def solve(self, solver='glpk', verbose=False, keepfiles=False, resolve=False, **kwargs):
        if solver == 'xpress':
            resolve = True

        solve_model(self._model, solver=solver, verbose=verbose, keepfiles=keepfiles, **kwargs)
        self._results = PSSTResults(self)

        if resolve:
            for t, row in self.results.unit_commitment.iterrows():
                for g, v in row.items():
                    if not pd.isnull(v):
                        self._model.UnitOn[g, t].fixed = True
                        self._model.UnitOn[g, t] = int(float(v))

            solve_model(self._model, solver=solver, verbose=verbose, keepfiles=keepfiles, is_mip=False, **kwargs)
            self._results = PSSTResults(self)

        self._status = 'solved' 
Example 13
Project: cellphonedb   Author: Teichlab   File: test_validator_database_random_entries.py    License: MIT License
def test_gene(self):

        dataframe = cellphonedb_app.cellphonedb.database_manager.get_repository(
            'gene').get_all_expanded()

        data_not_match = False

        for gene in gene_entries:
            db_gene = dataframe

            for column_name in gene:
                if gene[column_name] is None:
                    db_gene = db_gene[pd.isnull(db_gene[column_name])]
                else:
                    db_gene = db_gene[db_gene[column_name] == gene[column_name]]

            if (len(db_gene) < 1):
                app_logger.warning('Failed checking Gene:')
                app_logger.warning('Expected data:')
                app_logger.warning(gene)
                data_not_match = True

        self.assertFalse(data_not_match, "Some Gene doesn't match")
Example 14
Project: ffn   Author: pmorissette   File: test_core.py    License: MIT License
def test_calc_stats():
    # test twelve_month_win_perc divide by zero
    prices = df.C['2010-10-01':'2011-08-01']
    stats = ffn.calc_stats(prices).stats
    assert pd.isnull(stats['twelve_month_win_perc'])
    prices = df.C['2009-10-01':'2011-08-01']
    stats = ffn.calc_stats(prices).stats
    assert not pd.isnull(stats['twelve_month_win_perc'])

    # test yearly_sharpe divide by zero
    prices = df.C['2009-01-01':'2012-01-01']
    stats = ffn.calc_stats(prices).stats
    assert 'yearly_sharpe' in stats.index

    prices[prices > 0.0] = 1.0
    # throws warnings
    stats = ffn.calc_stats(prices).stats
    assert pd.isnull(stats['yearly_sharpe']) 
Example 15
Project: skutil   Author: tgsmith61591   File: transform.py    License: BSD 3-Clause "New" or "Revised" License
def _mode(x, def_fill=ImputerMixin._def_fill):
    """Get the most common value in a 1d
    H2OFrame. Ties will be handled in a non-specified
    manner.

    Parameters
    ----------

    x : ``H2OFrame``, shape=(n_samples, 1)
        The 1d frame from which to derive the mode
    """
    idx = x.as_data_frame(use_pandas=True)[x.columns[0]].value_counts().index

    # if the most common is null, then return the next most common.
    # if there is no next common (i.e., 100% null) then we return the def_fill
    if not pd.isnull(idx[0]):
        return idx[0]
    return idx[1] if idx.shape[0] > 1 else def_fill
Example 16
Project: vnpy_crypto   Author: birforce   File: test_foreign.py    License: MIT License
def test_missing_roundtrip():
    buf = BytesIO()
    dta = np.array([(np.nan, np.inf, "")],
                      dtype=[("double_miss", float), ("float_miss", np.float32),
                              ("string_miss", "a1")])
    writer = StataWriter(buf, dta)
    writer.write_file()
    buf.seek(0)
    dta = genfromdta(buf, missing_flt=np.nan)
    assert_(isnull(dta[0][0]))
    assert_(isnull(dta[0][1]))
    assert_(dta[0][2] == asbytes(""))

    dta = genfromdta(os.path.join(curdir, "results/data_missing.dta"),
            missing_flt=-999)
    assert_(np.all([dta[0][i] == -999 for i in range(5)])) 
Example 17
Project: meterstick   Author: google   File: pdutils.py    License: Apache License 2.0
def any_null(obj):
  """Checks if there are any null values in obj.

  Args:
    obj: A scalar, Series, or DataFrame.

  Returns:
    A boolean. True if there are any NaN values in obj.

  Raises:
    ValueError: if obj is not a scalar, Series, or DataFrame
  """
  if np.isscalar(obj):
    return pd.isnull(obj)
  elif isinstance(obj, pd.Series):
    return obj.isnull().any()
  elif isinstance(obj, pd.DataFrame):
    return obj.isnull().values.any()
  else:
    raise ValueError("obj is not a scalar, Series, or DataFrame.") 
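
A short usage sketch, assuming the any_null helper above is in scope; note that np.isscalar(None) is False, so a bare None would fall through to the ValueError branch:

import numpy as np
import pandas as pd

print(any_null(np.nan))                              # True
print(any_null(pd.Series([1, 2, 3])))                # False
print(any_null(pd.DataFrame({'a': [1.0, np.nan]})))  # True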
Example 18
Project: Computable   Author: ktraunmueller   File: test_resample.py    License: MIT License
def test_ohlc_5min(self):
        def _ohlc(group):
            if isnull(group).all():
                return np.repeat(np.nan, 4)
            return [group[0], group.max(), group.min(), group[-1]]

        rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50',
                         freq='10s')
        ts = Series(np.random.randn(len(rng)), index=rng)

        resampled = ts.resample('5min', closed='right',
                                label='right').ohlc()

        self.assertTrue((resampled.loc['1/1/2000 00:00'] == ts[0]).all())

        exp = _ohlc(ts[1:31])
        self.assertTrue((resampled.loc['1/1/2000 00:05'] == exp).all())

        exp = _ohlc(ts['1/1/2000 5:55:01':])
        self.assertTrue((resampled.loc['1/1/2000 6:00:00'] == exp).all())
Example 19
Project: Computable   Author: ktraunmueller   File: generic.py    License: MIT License
def clip_lower(self, threshold):
        """
        Return copy of the input with values below given value truncated

        See also
        --------
        clip

        Returns
        -------
        clipped : same type as input
        """
        if isnull(threshold):
            raise ValueError("Cannot use an NA value as a clip threshold")

        return self.where((self >= threshold) | isnull(self), threshold) 
Example 20
Project: Computable   Author: ktraunmueller   File: stata.py    License: MIT License
def _write_data_dates(self):
        convert_dates = self._convert_dates
        data = self.datarows
        byteorder = self._byteorder
        TYPE_MAP = self.TYPE_MAP
        MISSING_VALUES = self.MISSING_VALUES
        typlist = self.typlist
        for row in data:
            #row = row.squeeze().tolist() # needed for structured arrays
            for i, var in enumerate(row):
                typ = ord(typlist[i])
                #NOTE: If anyone finds this terribly slow, there is
                # a vectorized way to convert dates, see genfromdta for going
                # from int to datetime and reverse it. will copy data though
                if i in convert_dates:
                    var = _datetime_to_stata_elapsed(var, self.fmtlist[i])
                if typ <= 244:  # we've got a string
                    if len(var) < typ:
                        var = _pad_bytes(var, typ)
                    self._write(var)
                else:
                    if isnull(var):  # this only matters for floats
                        var = MISSING_VALUES[typ]
                    self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) 
Example 21
Project: git2net   Author: gotec   File: extraction.py    License: GNU Affero General Public License v3.0
def identify_file_renaming(git_repo_dir):
    """ Identifies all names and locations different files in a repository have had.

    Args:
        git_repo_dir: path to the git repository that is mined

    Returns:
        dag: pathpy DAG object depicting the renaming process
        aliases: dictionary containing all aliases for all files
    """

    # TODO: Consider corner case where file is renamed and new file with old name is created.
    git_repo = pydriller.GitRepository(git_repo_dir)

    dag = pp.DAG()
    for commit in tqdm(list(git_repo.get_list_commits()), desc='Creating DAG'):
        for modification in commit.modifications:

            if (modification.new_path not in dag.nodes) and \
               (modification.old_path == modification.new_path) and \
               (modification.change_type == pydriller.domain.commit.ModificationType.ADD):
                dag.add_node(modification.new_path)
            elif modification.old_path != modification.new_path:
                if pd.isnull(modification.old_path):
                    if modification.new_path not in dag.nodes:
                        dag.add_node(modification.new_path)
                elif pd.isnull(modification.new_path):
                    pass
                else:
                    dag.add_edge(modification.new_path, modification.old_path)

    dag.make_acyclic()
    nodes = [k for k, v in dag.nodes.items() if v['indegree'] == 0 and not v['outdegree'] == 0]
    aliases = {z: y[-1] for x in nodes for y in dag.routes_from_node(x) for z in y[:-1]}

    return dag, aliases 
Example 22
Project: MaliciousMacroBot   Author: egaus   File: mmbot.py    License: MIT License
def fill_missing_hashes(self, row):
        """
        Checks if there is a null or NaN value for the 'md5' column.  If so, computes it, if not,
        returns original value.  Used to fill in missing md5's in a dataframe.
        :param row: a row of a dataframe with a column named 'md5' and 'filepath'
        :return: for any missing md5 values, computes the hash on the given filepath
        """
        if pd.isnull(row['md5']):
            return self.get_file_hash(row['filepath'])
        else:
            return row['md5'] 
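
A hypothetical usage sketch: mmb stands in for a MaliciousMacroBot instance, and the frame has the 'md5' and 'filepath' columns the docstring requires; axis=1 hands each row to the method.

import pandas as pd

df = pd.DataFrame({'md5': ['abc123', None],
                   'filepath': ['a.doc', 'b.doc']})
# mmb is a hypothetical MaliciousMacroBot instance.
df['md5'] = df.apply(mmb.fill_missing_hashes, axis=1)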
Example 23
Project: naru   Author: naru-project   File: common.py    License: Apache License 2.0
def Discretize(col, data=None):
    """Transforms data values into integers using a Column's vocab.

    Args:
        col: the Column.
        data: list-like data to be discretized.  If None, defaults to col.data.

    Returns:
        col_data: discretized version; an np.ndarray of type np.int32.
    """
    # pd.Categorical() does not allow categories be passed in an array
    # containing np.nan.  It makes it a special case to return code -1
    # for NaN values.

    if data is None:
        data = col.data

    # pd.isnull returns true for both np.nan and np.datetime64('NaT').
    isnan = pd.isnull(col.all_distinct_values)
    if isnan.any():
        # We always add nan or nat to the beginning.
        assert isnan.sum() == 1, isnan
        assert isnan[0], isnan

        dvs = col.all_distinct_values[1:]
        bin_ids = pd.Categorical(data, categories=dvs).codes
        assert len(bin_ids) == len(data)

        # Since nan/nat bin_id is supposed to be 0 but pandas returns -1, just
        # add 1 to everybody
        bin_ids = bin_ids + 1
    else:
        # This column has no nan or nat values.
        dvs = col.all_distinct_values
        bin_ids = pd.Categorical(data, categories=dvs).codes
        assert len(bin_ids) == len(data), (len(bin_ids), len(data))

    bin_ids = bin_ids.astype(np.int32, copy=False)
    assert (bin_ids >= 0).all(), (col, data, bin_ids)
    return bin_ids 
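
The -1-to-0 shift at the heart of Discretize can be seen in isolation; a minimal sketch of how pd.Categorical encodes values missing from the category list:

import numpy as np
import pandas as pd

data = [np.nan, 'b', 'a', 'b']
codes = pd.Categorical(data, categories=['a', 'b']).codes
print(codes.tolist())        # [-1, 1, 0, 1] -- NaN gets code -1
print((codes + 1).tolist())  # [0, 2, 1, 2]  -- shift so NaN lands in bin 0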
Example 24
Project: recordlinkage   Author: J535D165   File: annotation.py    License: BSD 3-Clause "New" or "Revised" License
def _cast_value(value, na_value=None):

        if pd.isnull(value):
            return na_value
        elif type(value).__module__ == np.__name__:
            return value.item()
        else:
            return value 
Example 25
Project: recordlinkage   Author: J535D165   File: compare.py    License: BSD 3-Clause "New" or "Revised" License
def _compute_vectorized(self, s_left, s_right):

        if self.method == 'jaro':
            str_sim_alg = jaro_similarity
        elif self.method in ['jarowinkler', 'jaro_winkler', 'jw']:
            str_sim_alg = jarowinkler_similarity
        elif self.method == 'levenshtein':
            str_sim_alg = levenshtein_similarity
        elif self.method in [
                'dameraulevenshtein', 'damerau_levenshtein', 'dl'
        ]:
            str_sim_alg = damerau_levenshtein_similarity
        elif self.method in ['q_gram', 'qgram']:
            str_sim_alg = qgram_similarity
        elif self.method == 'cosine':
            str_sim_alg = cosine_similarity
        elif self.method in ['smith_waterman', 'smithwaterman', 'sw']:
            str_sim_alg = smith_waterman_similarity
        elif self.method in ['longest_common_substring', 'lcs']:
            str_sim_alg = longest_common_substring_similarity
        else:
            raise ValueError("The algorithm '{}' is not known.".format(
                self.method))

        c = str_sim_alg(s_left, s_right)

        if self.threshold is not None:
            c = c.where((c < self.threshold) | (pandas.isnull(c)), other=1.0)
            c = c.where((c >= self.threshold) | (pandas.isnull(c)), other=0.0)

        c = _fillna(c, self.missing_value)

        return c 
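
The two chained where() calls implement thresholding while leaving nulls untouched; a standalone sketch with a hypothetical threshold of 0.8:

import numpy as np
import pandas as pd

c = pd.Series([0.2, 0.9, np.nan, 0.75])
threshold = 0.8

# where() keeps values where the condition holds and substitutes
# `other` elsewhere; the isnull() term shields NaNs from both passes.
c = c.where((c < threshold) | (pd.isnull(c)), other=1.0)
c = c.where((c >= threshold) | (pd.isnull(c)), other=0.0)
print(c.tolist())  # [0.0, 1.0, nan, 0.0]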
Example 26
Project: pdfplumber   Author: jsvine   File: test-nics-background-checks-2015-11.py    License: MIT License
def test_pandas(self):
        page = self.pdf.pages[0]
        cropped = page.crop((0, 80, self.PDF_WIDTH, 485))
        table = cropped.extract_table({
            "horizontal_strategy": "text",
            "explicit_vertical_lines": [
                min(map(itemgetter("x0"), cropped.chars))
            ],
            "intersection_tolerance": 5
        })

        table = pd.DataFrame(table)

        def parse_value(x):
            if pd.isnull(x) or x == "": return None
            return int(x.replace(",", ""))

        table.columns = COLUMNS
        table[table.columns[1:]] = table[table.columns[1:]].applymap(parse_value)

        # [1:] because first column is state name
        for c in COLUMNS[1:]:
            total = table[c].iloc[-1]
            colsum = table[c].sum()
            assert(colsum == (total * 2))

        month_chars = within_bbox(page.chars, (0, 35, self.PDF_WIDTH, 65))
        month_text = collate_chars(month_chars)
        assert(month_text == "November - 2015") 
Example 27
Project: dataiku-contrib   Author: dataiku   File: tableau_utils.py    License: Apache License 2.0
def set_date(row, col, val):
    if pd.isnull(val):
        return
    # For some weird reason, the fractional second must be given in
    # units of 1/10000 second.
    row.setDateTime(col, val.year, val.month, val.day, val.hour,
                    val.minute, val.second, val.microsecond // 100)
Example 28
Project: arctic   Author: man-group   File: numpy_arrays.py    License: GNU Lesser General Public License v2.1
def _convert_types(self, a):
        """
        Converts object arrays of strings to numpy string arrays
        """
        # No conversion for scalar type
        if a.dtype != 'object':
            return a, None

        # We can't infer the type of an empty array, so just
        # assume strings
        if len(a) == 0:
            return a.astype('U1'), None

        # Compute a mask of missing values. Replace NaNs and Nones with
        # empty strings so that type inference has a chance.
        mask = pd.isnull(a)
        if mask.sum() > 0:
            a = a.copy()
            np.putmask(a, mask, '')
        else:
            mask = None

        if infer_dtype(a, skipna=False) == 'mixed':
            # assume its a string, otherwise raise an error
            try:
                a = np.array([s.encode('ascii') for s in a])
                a = a.astype('O')
            except Exception:
                raise ValueError("Column of type 'mixed' cannot be converted to string")

        type_ = infer_dtype(a, skipna=False)
        if type_ in ['unicode', 'string']:
            max_len = max_len_string_array(a)
            return a.astype('U{:d}'.format(max_len)), mask
        else:
            raise ValueError('Cannot store arrays with {} dtype'.format(type_)) 
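
The masking step in _convert_types is worth seeing on its own; a minimal sketch of pd.isnull() on an object array followed by np.putmask:

import numpy as np
import pandas as pd

a = np.array(['x', None, 'yz', np.nan], dtype=object)

# pd.isnull catches both None and np.nan inside object arrays.
mask = pd.isnull(a)
print(mask)  # [False  True False  True]

# Blank out the missing entries so dtype inference sees only strings.
a = a.copy()
np.putmask(a, mask, '')
print(a)  # ['x' '' 'yz' '']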
Example 29
Project: quail   Author: ContextLab   File: helpers.py    License: MIT License
def check_nan(x):
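    # pd.isnull returns a plain bool for scalars but an elementwise
    # result for array-likes; only a scalar null counts as NaN here.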
    y = pd.isnull(x)
    if type(y) is bool:
        return y
    else:
        return False 
Example 30
Project: cgpm   Author: probcomp   File: data.py    License: Apache License 2.0
def build_valmap(column):
    uniques = [u for u in sorted(column.unique()) if not pd.isnull(u)]
    return {u: k for k, u in enumerate(uniques)}
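
A short usage sketch, assuming build_valmap above is in scope; note that sorted() only tolerates nulls safely for numeric columns, since comparing np.nan with a str raises a TypeError:

import numpy as np
import pandas as pd

column = pd.Series([2.0, 1.0, np.nan, 2.0])
print(build_valmap(column))  # {1.0: 0, 2.0: 1}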