Python clean data

60 Python code examples are found related to "clean data". You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: ComplexityData.py    From py-ecomplexity with MIT License 7 votes vote down vote up
def clean_data(self, val_errors_flag_input):
        """Clean data to remove non-numeric values, handle NA's and duplicates"""
        # Make sure values are numeric
        self.data.val = pd.to_numeric(
            self.data.val, errors=val_errors_flag_input)
        self.data.set_index(['time', 'loc', 'prod'], inplace=True)
        if self.data.val.isnull().values.any():
            warnings.warn('NaN value(s) present, coercing to zero(es)')
            self.data.val.fillna(0, inplace=True)

        # Remove duplicates
        dups = self.data.index.duplicated()
        if dups.sum() > 0:
            warnings.warn(
                'Duplicate values exist, keeping the first occurrence')
            self.data = self.data[~self.data.index.duplicated()] 
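
As a quick illustration of the same pandas pattern (coerce to numeric, fill NaN with zero, drop duplicate index entries), here is a minimal self-contained sketch with made-up data, independent of the py-ecomplexity class above:

import pandas as pd

df = pd.DataFrame({
    'time': [2000, 2000, 2000],
    'loc':  ['A', 'A', 'B'],
    'prod': ['x', 'x', 'y'],
    'val':  ['10', 'bad', None],
})
df['val'] = pd.to_numeric(df['val'], errors='coerce')   # 'bad' and None become NaN
df = df.set_index(['time', 'loc', 'prod'])
df['val'] = df['val'].fillna(0)                          # NaN -> 0
df = df[~df.index.duplicated(keep='first')]              # keep only the first duplicate row
print(df)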
Example 2
Source File: zhihu_item.py    From FunpySpiderSearchEngine with Apache License 2.0 6 votes vote down vote up
def clean_data(self):
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except BaseException:
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except:
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"]) 
Example 3
Source File: lagou_companies.py    From webspider with MIT License 6 votes vote down vote up
def clean_lg_company_data(company_dict):
    """
    Clean the crawled company information.

    :param company_dict: tornado.util.ObjectDict
    """
    if 'size' in company_dict:
        company_dict.size = company_dict.size.strip()
    if 'finance_stage' in company_dict:
        company_dict.finance_stage = company_dict.finance_stage.strip()
    if 'features' in company_dict:
        company_dict.features = utils.text.to_plaintext(company_dict.features)
    if 'address' in company_dict:
        company_dict.address = utils.text.to_plaintext(company_dict.address)
    if 'introduce' in company_dict:
        company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else ''
        company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]
    if 'advantage' in company_dict:
        company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage))
        company_dict.advantage = json.dumps(company_dict.advantage)[
            :constants.COMPANY_ADVANTAGE_MAX_LEN]
    if 'industries' in company_dict:
        company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries)) 
Example 4
Source File: selectors.py    From invana-bot with MIT License 6 votes vote down vote up
def clean_data(elements=None, selector=None):
    """

    This is where the extracted data is cleaned up and where functions and data types are applied as needed.

    :param elements: extracted elements to clean
    :param selector: selector definition, including an optional "data_type" key
    :return: cleaned and type-converted data
    """
    data_type = selector.get("data_type", "RawField")

    if data_type.startswith("List"):
        multiple = True
    else:
        multiple = False

    data_extractor = SelectorExtractor()
    if multiple is True:
        extracted_data = data_extractor.get_list_data(elements=elements)
    else:
        extracted_data = data_extractor.get_single_data(elements=elements)
    data = transform_data(data=extracted_data, data_type=data_type)
    return data 
Example 5
Source File: jquery_scrolldepth.py    From carebot with MIT License 6 votes vote down vote up
def clean_data(self, data):
        """
        Fix data types, truncate the data, and otherwise make it fit for
        consumption.
        """
        rows = []
        for row in data:
            row[0] = int(row[0]) # Percent depth on page
            row[1] = int(row[1]) # Total users
            row[2] = int(row[2]) # Seconds on page
            rows.append(row)

        # Sort the row data from 10% => 100%
        rows.sort(key=lambda tup: tup[0])

        rows = self.fill_in_max(rows)

        # Only take the first 10 rows.
        truncated = rows[:10]
        return truncated 
Example 6
Source File: punctuator.py    From keras-punctuator with MIT License 6 votes vote down vote up
def cleanData(inputFile):
    sys.stderr.write("Cleaning data " + inputFile + "\n")
    mappings = OrderedDict([
        (re.compile("['’]"), "'"),
        # (re.compile("' s([" + DOT_LIKE_AND_SPACE + "])"), "'s\g<1>"), # Removes strange text mistake pattern in europarl data.
        (re.compile("n't"), " n't"),
        #(re.compile(" '([^" + DOT_LIKE + "']*)'"), '. \g<1>.'), # Remove quoting apostrophes.
        (re.compile("'([^t])"), " '\g<1>"), # Separate tokens like "'s" "'ll" and so on.
        #(re.compile('\([^)]*\)'), ''), # Removes bracketed.
        (re.compile('[-—]'), ' '), # Dash to space.
        (re.compile('[^a-z0-9A-Z\',\.?! ]'), ' '), # Other unknown to space.
        # (re.compile('^$|^\.$'), ''), # Removes empty line.
    ])
    cleanFile = inputFile + '.clean'
    regexProcess(mappings, inputFile, cleanFile)
    return cleanFile 
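
To see what ordered regex mappings of this kind do in practice, here is a small standalone sketch on a made-up sentence; the patterns are a subset of the ones in cleanData above and are applied in order with re.sub:

import re
from collections import OrderedDict

mappings = OrderedDict([
    (re.compile("['’]"), "'"),                    # normalise apostrophes
    (re.compile("n't"), " n't"),                  # split "don't" into "do n't"
    (re.compile('[-—]'), ' '),                    # dash to space
    (re.compile("[^a-z0-9A-Z',\\.?! ]"), ' '),    # any other unknown character to space
])

text = "I don’t know — it's £5!"
for pattern, replacement in mappings.items():
    text = pattern.sub(replacement, text)
print(text)   # apostrophes normalised, "n't" separated, dash and '£' turned into spaces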
Example 7
Source File: get_data.py    From Short-Text-Summarization with Apache License 2.0 6 votes vote down vote up
def get_clean_data(filepath):
	# use re to delete useless information in data
	list_summary , list_short_text = parser_txt_to_data(filepath)

	def _remove_special_char(m):
		s = m.group(0)
		if s in u',。!?;“”:《》':
			return s
		return ''

	for i,line in enumerate(list_summary):
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_summary[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')
	
	for i,line in enumerate(list_short_text):
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_short_text[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')
	
	#print len(list_summary),type(list_summary),len(list_short_text),type(list_short_text)
	return list_summary , list_short_text 
Example 8
Source File: kanji.py    From apex-sigma-core with GNU General Public License v3.0 6 votes vote down vote up
def clean_readings_data(kanji_dict):
    """

    :param kanji_dict:
    :type kanji_dict:
    :return:
    :rtype:
    """
    readings = kanji_dict['readings']
    bad_chars = ['、 ', '、', '\t', ' ']
    rds = {'readings': {'kun': [], 'on': [], 'names': []}}
    for r_type in readings:
        for item in readings[r_type]:
            if item not in bad_chars:
                for char in bad_chars:
                    if char in item:
                        item = item.replace(char, '')
                rds['readings'][r_type].append(item)
    return rds 
Example 9
Source File: train_and_predict.py    From titanic_machine_learning_example with MIT License 6 votes vote down vote up
def cleanData(data):
  # If fare data is missing, replace it with the average from that class
  data.Fare = data.Fare.map(lambda x: np.nan if x==0 else x)
  classmeans = data.pivot_table('Fare', rows='Pclass', aggfunc='mean')
  data.Fare = data[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )

  # Turn names into a number representing titles
  data.Name = data.Name.map(lambda x: parseName(x))

  # Convert sex into a numeric value
  data.Sex = data.Sex.apply(lambda sex: 0 if sex == "male" else 1)

  return data


# Load training and test data sets, cleaning them in the process 
Example 10
Source File: cli.py    From betterlifepsi with MIT License 6 votes vote down vote up
def clean_transaction_data():
    """
    Clean all the transaction data, and keep all master data
    """
    # TODO.xqliu Disable clean of database for production
    from psi.app.service import Info
    database = Info.get_db()
    database.engine.execute("""
        DELETE FROM related_values;
        DELETE FROM inventory_in_out_link;
        DELETE FROM incoming;
        DELETE FROM shipping_line;
        DELETE FROM shipping;
        DELETE FROM expense;
        DELETE FROM receiving_line;
        DELETE FROM receiving;
        DELETE FROM purchase_order_line;
        DELETE FROM purchase_order;
        DELETE FROM sales_order_line;
        DELETE FROM sales_order;
        DELETE FROM inventory_transaction_line;
        DELETE FROM inventory_transaction;
        commit;
    """) 
Example 11
Source File: execute.py    From olapy with GNU Lesser General Public License v2.1 6 votes vote down vote up
def clean_data(star_schema_df, measures):
        """measure like this: 1 349 is not numeric so we try to transform it to
        1349.

        :param star_schema_df: start schema dataframe
        :param measures: list of measures columns names

        :return: cleaned columns
        """
        if measures:
            for measure in measures:
                if star_schema_df[measure].dtype == object:
                    star_schema_df[measure] = star_schema_df[measure].str.replace(
                        " ", ""
                    )
                    try:
                        star_schema_df[measure] = star_schema_df[measure].astype(
                            "float"
                        )
                    except ValueError:
                        star_schema_df = star_schema_df.drop(measure, 1)
        return star_schema_df 
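
The idea above — strings such as "1 349" become 1349.0, and columns that still fail numeric conversion are dropped — can be sketched standalone on toy data (this is not the olapy API, just the same pattern):

import pandas as pd

df = pd.DataFrame({'sales': ['1 349', '2 500', '17'], 'note': ['n/a', 'x', 'y']})
for measure in ['sales', 'note']:
    if df[measure].dtype == object:
        df[measure] = df[measure].str.replace(' ', '')
        try:
            df[measure] = df[measure].astype('float')   # 'sales' becomes numeric
        except ValueError:
            df = df.drop(columns=measure)               # 'note' cannot be converted, drop it
print(df.dtypes)   # only the numeric 'sales' column remains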
Example 12
Source File: lsi_model.py    From aca with MIT License 6 votes vote down vote up
def clean_data( text):

    n_text = []
    text = text.strip()
    p_set = '. , ! : ? ` ` '.split()
    for i in range(len(text)):
        if text[i] not in p_set:
            n_text.append(text[i])
    text = ''.join(n_text)

    stop_list =set('a is are on from for and not to'.split())
    #stop_list =set('a is are on from for and not to that this there these \
    #               those have has been were I you me they can could be do . , : ! ? '.split())
    text = [word for word in text.lower().split() if word not in stop_list]
    #text = [stemmer.stem(t) for t in text]
    return text 
Example 13
Source File: LegendaryFish.py    From Pirates-Online-Rewritten with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def cleanFishData(self):
        self.staminaValue = self.myData['stamina']
        self.fishStaminaBar['value'] = self.staminaValue
        self.fishStaminaValueLabel.setText(str(int(self.staminaValue)) + ' / ' + str(100))
        self.fishStaminaBar['barColor'] = FishingGlobals.fishingStaminaBarColor[int(self.staminaValue / 100.0 * (len(FishingGlobals.fishingStaminaBarColor) - 1))]
        self.hideStaminaBar()
        taskMgr.remove('updateFishStaminaTask')
        self.lurePosition = None
        self.fishChaseLureSequence.pause()
        self.fishChaseLureSequence.clearToInitial()
        self.lfStruggleSequence.pause()
        self.lfStruggleSequence.clearToInitial()
        if self.aboutToBitingInterval is None:
            return
        self.aboutToBitingInterval.pause()
        return 
Example 14
Source File: data_cleaning.py    From DataProcessing_Python with MIT License 6 votes vote down vote up
def clean_data(self):
        """Performs standard data cleaning functions
        """

        self.extract_ids()
        self.extract_target()

        self.check_column_names()
        
        self.remove_constant_variables()
        self.convert_columns_to_binary()

        self.check_date_variables()
        self.check_categorical_variables()
        self.encode_categories()

        print("Data is clean and ready!\n")


    ## function for removing columns 
Example 15
Source File: cqhttp_helper.py    From python-cqhttp with MIT License 6 votes vote down vote up
def clean_data_dir(self, *, data_dir):
        """
        Clean a data directory.

        ------------

        :param str data_dir: name of the directory to clean; supports `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean a data directory that has accumulated too many old files, such as `image`.

        Added in HTTP API v3.3.4
        """
        return super().__getattr__('clean_data_dir') \
            (data_dir=data_dir) 
Example 16
Source File: cqhttp_helper.py    From python-cqhttp with MIT License 6 votes vote down vote up
def clean_data_dir_async(self, *, data_dir):
        """
        Clean a data directory (asynchronous version).

        ------------

        :param str data_dir: name of the directory to clean; supports `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean a data directory that has accumulated too many old files, such as `image`.

        Added in HTTP API v3.3.4
        """
        return super().__getattr__('clean_data_dir_async') \
            (data_dir=data_dir) 
Example 17
Source File: event_study.py    From sanpy with MIT License 6 votes vote down vote up
def clean_data(data, events, starting_point):
    """
    Cleans out signals that do not have enough pricing data
    """
    events_df = events.copy(deep=True)
    events_df['in_pricesdf'] = 0
    id = 0

    for date, row in events_df.iterrows():
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            events_df.iloc[id, -1] = 1
            id = id+1
            continue
        event_day = data.index.searchsorted(date)
        hist_index_start = event_day - starting_point
        hist_index_end = event_day + starting_point
        event_window = data.iloc[hist_index_start:hist_index_end][[sid]]
        if event_window.min()[0] == 0 or len(event_window) == 0:
            events_df.iloc[id, -1] = 1
        id = id+1
    return events_df[events_df['in_pricesdf'] == 0] 
Example 18
Source File: auxiliary_dataset.py    From ZeroShotVideoClassification with Apache License 2.0 6 votes vote down vote up
def clean_data(fnames, labels):
        if not isinstance(fnames[0], str):
            print('Cannot check for broken videos')
            return fnames, labels
        broken_videos_file = 'assets/kinetics_broken_videos.txt'
        if not os.path.exists(broken_videos_file):
            print('Broken video list does not exist')
            return fnames, labels

        t = time()
        with open(broken_videos_file, 'r') as f:
            broken_samples = [r[:-1] for r in f.readlines()]
        data = [x[75:] for x in fnames]
        keep_sample = np.in1d(data, broken_samples) == False
        fnames = np.array(fnames)[keep_sample]
        labels = np.array(labels)[keep_sample]
        print('Broken videos %.2f%% - removing took %.2f' % (100 * (1.0 - keep_sample.mean()), time() - t))
        return fnames, labels 
Example 19
Source File: model.py    From libhxl-python with The Unlicense 6 votes vote down vote up
def clean_data(
            self, whitespace=[], upper=[], lower=[], date=[], date_format=None,
            number=[], number_format=None, latlon=[], purge=False, queries=[]
    ):
        """Clean data fields."""
        import hxl.filters
        return hxl.filters.CleanDataFilter(
            self,
            whitespace=whitespace,
            upper=upper,
            lower=lower,
            date=date, date_format=date_format,
            number=number, number_format=number_format,
            latlon=latlon,
            purge=purge,
            queries=queries
        ) 
Example 20
Source File: cron.py    From oh-my-rss with MIT License 6 votes vote down vote up
def clean_history_data():
    """
    Clean up historical data
    :return:
    """
    logger.info('Start cleaning historical data')

    lastweek = datetime.now() - timedelta(days=7)
    last3month = datetime.now() - timedelta(days=90)
    lastyear = datetime.now() - timedelta(days=365)

    # star < 10: delete directly
    Article.objects.filter(site__star__lt=10, ctime__lte=lastweek).delete()

    # star in [10, 20): created more than 3 months ago, clear the content
    Article.objects.filter(site__star__gte=10, site__star__lt=20, ctime__lte=last3month).update(content=' ')

    # star >= 20: created more than a year ago, clear the content
    Article.objects.filter(site__star__gte=20, ctime__lte=lastyear).update(content=' ')

    # compact the database
    vacuum_sqlite_db()

    logger.info('Historical data cleanup complete')
Example 21
Source File: sdk.py    From darwin-sdk with Apache License 2.0 6 votes vote down vote up
def clean_data(self, dataset_name, **kwargs):
        url = self.server_url + self.routes['clean_data'] + urllib.parse.quote(dataset_name, safe='')
        headers = self.get_auth_header()
        parameters = kwargs
        if headers is None:
            return False, "Cannot get Auth token. Please log in."
        r = self.s.post(url, headers=headers, json=parameters)
        if not r.ok and 'Please run analyze data' in r.text:
            print("Raw profile not found. Running analyze_data")
            char_encoding = parameters['char_encoding'] if 'char_encoding' in parameters else 'utf-8'
            r = self.analyze_data(dataset_name, char_encoding=char_encoding)
            if r[0]:
                r = self.s.post(url, headers=headers, json=parameters)
            else:
                return r
        return self.get_return_info(r)

    # Create risk information for a dataset
Example 22
Source File: atlas3.py    From ssbio with MIT License 6 votes vote down vote up
def clean_data(self, keep_features=None, remove_correlated_feats=True):
        self.features_df = self.features_df.astype(float).fillna(0)
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                                   np.absolute(perc_spearman.correlation))
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]
                indices_to_keep.append(centroid)

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)] 
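
The correlated-feature removal above clusters features on a Spearman-based distance (1 - |rho|) and keeps one representative per cluster. A compact, self-contained sketch of that grouping step on synthetic data (this is not the ssbio API; the 0.1 threshold mirrors the code above):

import numpy as np
import pandas as pd
import scipy.stats
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
base = rng.normal(size=50)
tmp = pd.DataFrame({'f1': base,
                    'f2': base + rng.normal(scale=0.01, size=50),   # nearly identical to f1
                    'f3': rng.normal(size=50)})                     # independent feature

corr = scipy.stats.spearmanr(tmp).correlation
dist = 1.0 - np.abs(corr)                    # highly correlated -> distance near 0
np.fill_diagonal(dist, 0)
dist = np.maximum(dist, dist.T)              # enforce exact symmetry for squareform
clusters = fcluster(linkage(squareform(dist), method='average'), 0.1, criterion='distance')
print(dict(zip(tmp.columns, clusters)))      # f1 and f2 share a cluster, f3 gets its own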
Example 23
Source File: cluster.py    From yass with Apache License 2.0 6 votes vote down vote up
def clean_input_data(self):
        # limit clustering to at most 50,000 spikes
        max_spikes = self.CONFIG.cluster.max_n_spikes
        if len(self.spike_times_original)>max_spikes:
            idx_sampled = np.random.choice(
                a=np.arange(len(self.spike_times_original)),
                size=max_spikes,
                replace=False)
            self.spike_times_original = self.spike_times_original[idx_sampled]
        else:
            idx_sampled = np.arange(len(self.spike_times_original))

        # limit indexes away from edge of recording
        idx_inbounds = np.where(np.logical_and(
                        self.spike_times_original>=self.spike_size//2,
                        self.spike_times_original<(self.reader_raw.rec_len-self.spike_size)))[0]
        self.spike_times_original = self.spike_times_original[
            idx_inbounds].astype('int32')

        # clean upsampled ids if available
        if not self.raw_data:
            self.template_ids_in = self.template_ids_in[
                idx_sampled][idx_inbounds].astype('int32') 
Example 24
Source File: marriage.py    From DataExploration with MIT License 6 votes vote down vote up
def CleanData(resp):
    """Cleans a respondent DataFrame.

    resp: DataFrame of respondents

    Adds columns: agemarry, age, decade, fives
    """
    resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)

    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) 
             for cm in resp.cmbirth]
    resp['year'] = (pd.DatetimeIndex(dates).year - 1900)
    resp['decade'] = resp.year // 10
    resp['fives'] = resp.year // 5 
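
The cmbirth / cmmarrhx fields above are century-month codes (months counted from December 1899, as the month0 anchor suggests); a tiny standalone sketch of that conversion with made-up codes:

import pandas as pd

cmbirth = pd.Series([1158, 1170])                 # hypothetical century-month codes
month0 = pd.to_datetime('1899-12-15')
dates = [month0 + pd.DateOffset(months=cm) for cm in cmbirth]
year = pd.DatetimeIndex(dates).year - 1900        # -> 96 and 97 (i.e. 1996, 1997)
print(year // 10, year // 5)                      # decade and five-year bins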
Example 25
Source File: clean.py    From santander-product-recommendation-8th-place with MIT License 6 votes vote down vote up
def clean_data(fi, fo, header, suffix):
    head = fi.readline().strip("\n").split(",")
    head = [h.strip('"') for h in head]
    for i, h in enumerate(head):
        if h == "nomprov":
            ip = i
    print(ip)
    n = len(head)
    if header:
        fo.write("%s\n" % ",".join(head))

    print(n)
    for line in fi:
        fields = line.strip("\n").split(",")
        if len(fields) > n:
            prov = fields[ip] + fields[ip+1]
            del fields[ip]
            fields[ip] = prov
        assert len(fields) == n
        fields = [field.strip() for field in fields]
        fo.write("%s%s\n" % (",".join(fields), suffix)) 
Example 26
Source File: filter.py    From Phen2Gene with MIT License 6 votes vote down vote up
def clean_term_data(HPid,xref,is_a,name,definition,is_obsolete,replaced_by,consider,alt_id,synonym,created_by, creation_date,comment, subset,property_value):
    HPid = ""
    xref = []
    synonym = []
    is_a = []
    name = ""

    definition = ""
    is_obsolete = False
    replaced_by = []
    consider = []
    alt_id = []

    created_by = ""
    creation_date = ""
    comment = ""
    subset = ""
    property_value = ""

    return (HPid,xref,is_a,name,definition,is_obsolete,replaced_by,consider,alt_id,synonym,created_by, creation_date,comment,subset,property_value) 
Example 27
Source File: make_mock_json.py    From osbs-client with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def clean_data(self, out_data):
        if isinstance(out_data, dict):
            cleaned_data = {}
            for key, data in out_data.items():
                cleaned_data[key] = self.clean_data(data)
            return cleaned_data
        elif isinstance(out_data, list):
            cleaned_data = []
            for data in out_data:
                cleaned_data.append(self.clean_data(data))
            return cleaned_data
        elif isinstance(out_data, str):
            if re.search(self.rh_pattern, out_data):
                return re.sub(self.ex_pattern, "example.com", out_data)
            else:
                return out_data
        else:
            return out_data 
Example 28
Source File: movie.py    From BoxOfficeMojo with MIT License 6 votes vote down vote up
def clean_data(self):
        """Formats all the extracted data into the appropriate types"""

        for results in self.data["Weekly"]:
            utils.convert_financial_field(results, "Average Per Theatre")
            utils.convert_financial_field(results, "Gross")
            utils.convert_financial_field(results, "Gross To Date")
            utils.convert_percent_field(results, "Week Over Week Change")
            utils.convert_date_field(results, "Week")
            utils.convert_int_field(results, "Rank")
            utils.convert_int_field(results, "Theaters")
            utils.convert_int_field(results, "Theatre Change")
            utils.convert_int_field(results, "Week Number")

        for key, value in self.data.iteritems():
            if "Total Gross" in key or "." in key:
                self.data.pop(key)
                break
        utils.standardize_keys(self.data) 
Example 29
Source File: data_cleanup.py    From DeepLearning-IDS with MIT License 6 votes vote down vote up
def cleanAllData():
    # inputDataPath = os.path.join(
    #    os.path.dirname(os.path.realpath(__file__)), )
    inputDataPath = '../ProcessedTrafficData'
    outputDataPath = '../NewCleanedData'
    if (not os.path.exists(outputDataPath)):
        os.mkdir(outputDataPath)

    files = os.listdir(inputDataPath)
    for file in files:
        if file.startswith('.'):
            continue
        if os.path.isdir(file):
            continue
        outFile = os.path.join(outputDataPath, file)
        inputFile = os.path.join(inputDataPath, file)
        cleanData(inputFile, outFile) 
Example 30
Source File: report_db_accessor_base.py    From koku with GNU Affero General Public License v3.0 6 votes vote down vote up
def clean_data(self, data, table_name):
        """Clean data for insertion into database.

        Args:
            data (dict): The data to be cleaned
            table_name (str): The table name the data is associated with

        Returns:
            (dict): The data with values converted to required types

        """
        column_types = self.report_schema.column_types[table_name]

        for key, value in data.items():
            if value is None or value == "":
                data[key] = None
                continue
            if column_types.get(key) == int or column_types.get(key) == "BigIntegerField":
                data[key] = self._convert_value(value, int)
            elif column_types.get(key) == float:
                data[key] = self._convert_value(value, float)
            elif column_types.get(key) == Decimal:
                data[key] = self._convert_value(value, Decimal)

        return data 
Example 31
Source File: run_lfads.py    From object_detection_with_tensorflow with MIT License 5 votes vote down vote up
def clean_data_dict(data_dict):
  """Add some key/value pairs to the data dict, if they are missing.
  Args:
    data_dict - dictionary containing data for LFADS
  Returns:
    data_dict with some keys filled in, if they are absent.
  """

  keys = ['train_truth', 'train_ext_input', 'valid_data',
          'valid_truth', 'valid_ext_input', 'valid_train']
  for k in keys:
    if k not in data_dict:
      data_dict[k] = None

  return data_dict 
Example 32
Source File: eval.py    From Det3D with Apache License 2.0 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_cls_name, difficulty=None):
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    num_gt = len(gt_anno["name"])
    num_dt = len(dt_anno["name"])
    num_valid_gt = 0
    for i in range(num_gt):
        gt_name = gt_anno["name"][i].lower()
        valid_class = -1
        if gt_name == current_cls_name:
            valid_class = 1
        else:
            valid_class = -1
        ignore = False
        if valid_class == 1 and not ignore:
            ignored_gt.append(0)
            num_valid_gt += 1
        else:
            ignored_gt.append(-1)
    for i in range(num_dt):
        if dt_anno["name"][i] == current_cls_name:
            valid_class = 1
        else:
            valid_class = -1
        if valid_class == 1:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes 
Example 33
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes vote down vote up
def clean_data(self, df, target_var, task_type, name):
        """Cleans a dataset by removing outliers
        Outiers and missing values are replaced by
        median for continuous and mode for categorical

        Arguments:
                df : dask dataframe, The dataframe to be cleaned
                target_var : string, Name of the target variable
                task_type : string, Type of the task at hand
                name : string, Name of the data being cleaned (train or eval)

        Returns:
                df : dask dataframe, Cleaned dataframe
                mean : dask series, mean of each column
                std_dev : dask series, standard deviation of each column
                _csv_defaults : list, list of default value of each column

        """
        mean, median, mode, std_dev = self.calculate_stats(df, target_var)
        df = self.dropping_zero_var_cols(df, target_var, std_dev)
        df = self.impute(df, target_var, median, mode)
        if task_type == 'classification':
            if df[target_var].dtype == 'float64':
                df[target_var] = df[target_var].astype(np.int64)
        dtype_map = {'float64': 0., 'int64': 0, 'object': ''}
        dtype_list = [str(dtype) for dtype in df.dtypes]
        _csv_defaults = [[dtype_map[dtype]] for dtype in dtype_list]
        if name == 'train' and task_type == 'classification':
            self.creating_explainer_lime(df, target_var)
        df.to_csv('/tmp/clean_*_' + str(name) + '.csv', index=False)
        return df, mean, std_dev, _csv_defaults 
Example 34
Source File: telescopes.py    From pyLIMA with GNU General Public License v3.0 5 votes vote down vote up
def clean_data_magnitude(self):
        """
        Clean the telescope's outliers for the fits. Points are considered outliers if they
        are 10 mag brighter or fainter than the lightcurve median, if NaN appears in any
        column, or if the error bar is larger than 1 mag.

        :return: the cleaned magnitude lightcurve
        :rtype: array_like
        """

        maximum_accepted_precision = 1.0

        index = np.where((~np.isnan(self.lightcurve_magnitude).any(axis=1)) &
                         (np.abs(self.lightcurve_magnitude[:, 2]) <= maximum_accepted_precision))[0]

        lightcurve = self.lightcurve_magnitude[index]

        index = np.where((np.isnan(self.lightcurve_magnitude).any(axis=1)) |
                         (np.abs(self.lightcurve_magnitude[:, 2]) > maximum_accepted_precision))[0]
        if len(index) != 0:
            self.bad_points_magnitude = index
            print('pyLIMA found some bad points in the telescope ' + self.name + ', you can find these in the ' \
                                                                                 'bad_points_magnitude attribute.')

        return lightcurve 
Example 35
Source File: telescopes.py    From pyLIMA with GNU General Public License v3.0 5 votes vote down vote up
def clean_data_flux(self):
        """
        Clean the telescope's outliers for the fits. Points are considered outliers if they
        are 10 mag brighter or fainter than the lightcurve median, if NaN appears in any
        column, or if the error bar is larger than 1 mag.

        :return: the cleaned flux lightcurve
        :rtype: array_like
        """

        maximum_accepted_precision = 1.0
        flux = self.lightcurve_flux[:, 1]
        error_flux = self.lightcurve_flux[:, 2]
        index = np.where(
            (~np.isnan(self.lightcurve_flux).any(axis=1)) & (np.abs(error_flux / flux) <= maximum_accepted_precision) & (flux>0))[
            0]

        lightcurve = self.lightcurve_flux[index]

        index = np.where(
            (np.isnan(self.lightcurve_flux).any(axis=1)) | (np.abs(error_flux / flux) > maximum_accepted_precision) | (flux<=0))[0]
        if len(index) != 0:
            self.bad_points_flux = index
            print('pyLIMA found some bad points in the telescope ' + self.name + ', you can find these in the ' \
                                                                                 'bad_points_flux attribute.')
        return lightcurve 
Example 36
Source File: readers.py    From PVGeo with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def clean_data_name(data_name, filename):
    """A helper to clean a filename to make a useful data array name"""
    if data_name is None or data_name == '':
        data_name = os.path.splitext(os.path.basename(filename))[0]
    return data_name 
Example 37
Source File: taxi.py    From code-snippets with Apache License 2.0 5 votes vote down vote up
def clean_raw_data_dict(input_dict, raw_feature_spec):
  """Clean raw data dict."""
  output_dict = {}

  for key in raw_feature_spec:
    if key not in input_dict or not input_dict[key]:
      output_dict[key] = []
    else:
      output_dict[key] = [input_dict[key]]
  return output_dict 
Example 38
Source File: filterUtils.py    From director with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cleanPolyData(polyData):
    clean = vtk.vtkCleanPolyData()
    clean.SetInputData(polyData)
    clean.Update()
    return shallowCopy(clean.GetOutput()) 
Example 39
Source File: evaluation_runner.py    From moviegeek with MIT License 5 votes vote down vote up
def clean_data(self, ratings, min_ratings=5):
        self.logger.debug("cleaning data only to contain users with at least {} ratings".format(min_ratings))

        original_size = ratings.shape[0]

        user_count = ratings[['user_id', 'movie_id']]
        user_count = user_count.groupby('user_id').count()
        user_count = user_count.reset_index()
        user_ids = user_count[user_count['movie_id'] > min_ratings]['user_id']

        ratings = ratings[ratings['user_id'].isin(user_ids)]
        new_size = ratings.shape[0]
        self.logger.debug('reduced dataset from {} to {}'.format(original_size, new_size))
        return ratings 
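
A quick way to check that filter on a toy ratings frame (hypothetical data, not tied to the moviegeek project), keeping only users with more than five ratings:

import pandas as pd

ratings = pd.DataFrame({
    'user_id':  [1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3],
    'movie_id': [10, 11, 12, 10, 11, 10, 11, 12, 13, 14, 15],
    'rating':   [4, 5, 3, 2, 4, 5, 4, 3, 5, 4, 2],
})
user_count = ratings.groupby('user_id')['movie_id'].count().reset_index()
user_ids = user_count[user_count['movie_id'] > 5]['user_id']   # only user 3 qualifies
print(ratings[ratings['user_id'].isin(user_ids)])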
Example 40
Source File: holoclean.py    From HoloClean-Legacy-deprecated with Apache License 2.0 5 votes vote down vote up
def load_clean_data(self, file_path):
        """
        Loads pre-defined clean cells from csv

        :param file_path: path to file

        :return: spark dataframe of clean cells
        """
        clean = self.holo_env.spark_session.read.csv(file_path, header=True)
        self.holo_env.dataengine.add_db_table('C_clean', clean, self.dataset)

        return clean 
Example 41
Source File: mturk_depth_api_lsp.py    From rel_3d_pose with MIT License 5 votes vote down vote up
def cleanAssignmentData( ass_data ):
    _polished_data                  = {}
    _polished_data['worker_id']     = ass_data['_worker_id']
    _polished_data['worker_exp']    = ass_data['_worker_exp']
    _polished_data['assignment_id'] = ass_data['_assignment_id']
    _polished_data['hit_id']        = ass_data['_hit_id']
    _polished_data['response_time'] = ass_data['_hit_rt']
    _polished_data['hit_comment']   = ass_data['_hit_comment']
    _polished_data['hit_it']        = ass_data['_hit_it']
    _polished_data['gui_rating']    = ass_data['_gui_rating']

    _ass_lsp_subj_ids = _mongo_coll_3.find_one({'_amt_hit_id':_polished_data['hit_id']})['_lsp_subjs_ids']
    _polished_data['lsp_subj_ids'] = _ass_lsp_subj_ids

    _trials_results_dict = json.loads( ass_data['_trials_results'] )

    _ass_trials = []

    _error = False
    for key in _trials_results_dict.keys():
        _trial = _trials_results_dict[key]
        _depth = json.loads(_trial['_depth_str'])

        _trial_info = \
            { "depth": _depth,
              'response_time': _trial['_trial_rt'],
              'lsp_subj_id': _trial['_lsp_subj_id'] }
        res_coll_1 = _mongo_coll_1.find_one({ '_lsp_subj_id':_trial['_lsp_subj_id'] })
        _trial_info['img_id'] = res_coll_1['_lsp_img_id']
        _ass_trials.append(_trial_info)

    _polished_data['trials'] = _ass_trials

    return (_polished_data, _error, ass_data['_hit_reject_flag'], ass_data['_hit_flag']) 
Example 42
Source File: visualize_preprocessing.py    From kaggle-heart with MIT License 5 votes vote down vote up
def clean_image_data(imdata, metadata):
    """
    clean up the 4D tensor of imdata consistently (fix contrast, orient images the right way up, etc.)
    :param imdata:
    :return:
    """

    # normalize contrast
    flat_data = np.concatenate([i.flatten() for i in imdata]).flatten()
    high = np.percentile(flat_data, 95.0)
    low  = np.percentile(flat_data, 5.0)
    print high,low
    for i in xrange(len(imdata)):
        image = imdata[i]
        image = 1.0 * (image - low) / (high - low)
        image = np.clip(image, 0.0, 1.0)
        imdata[i] = image

    return imdata 
Example 43
Source File: gas.py    From BNAF with MIT License 5 votes vote down vote up
def load_data_and_clean(file):

    data = load_data(file)
    B = get_correlation_numbers(data)

    while np.any(B > 1):
        col_to_remove = np.where(B > 1)[0][0]
        col_name = data.columns[col_to_remove]
        data.drop(col_name, axis=1, inplace=True)
        B = get_correlation_numbers(data)
    # print(data.corr())
    data = (data - data.mean()) / data.std()

    return data 
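
get_correlation_numbers is not shown in this snippet; presumably it counts, for each column, how many columns (including itself) it is strongly correlated with, so the loop drops columns until no pair remains highly correlated. A hedged sketch of that assumption, followed by the same drop-until-decorrelated loop and standardisation:

import numpy as np
import pandas as pd

def get_correlation_numbers(data, threshold=0.98):
    # hypothetical helper: for each column, count columns with |corr| above the threshold
    corr = data.corr().abs()
    return (corr > threshold).sum().values

rng = np.random.default_rng(1)
x = rng.normal(size=200)
data = pd.DataFrame({'a': x,
                     'b': x * 2 + 1e-6 * rng.normal(size=200),   # nearly collinear with 'a'
                     'c': rng.normal(size=200)})

B = get_correlation_numbers(data)
while np.any(B > 1):
    col_name = data.columns[np.where(B > 1)[0][0]]
    data = data.drop(col_name, axis=1)
    B = get_correlation_numbers(data)

data = (data - data.mean()) / data.std()    # standardise the remaining columns
print(data.columns.tolist())                # 'a' was dropped, 'b' and 'c' remain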
Example 44
Source File: gas.py    From BNAF with MIT License 5 votes vote down vote up
def load_data_and_clean_and_split(file):

    data = load_data_and_clean(file).as_matrix()
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data_train = data[0:-N_test]
    N_validate = int(0.1 * data_train.shape[0])
    data_validate = data_train[-N_validate:]
    data_train = data_train[0:-N_validate]

    return data_train, data_validate, data_test 
Example 45
Source File: utils.py    From pycrop-yield-prediction with MIT License 5 votes vote down vote up
def load_clean_yield_data(yield_data_filepath):
    """
    Cleans the yield data by making sure any NaN values in the columns we care about
    are removed
    """
    important_columns = ['Year', 'State ANSI', 'County ANSI', 'Value']
    yield_data = pd.read_csv(yield_data_filepath).dropna(subset=important_columns,
                                                         how='any')
    return yield_data 
Example 46
Source File: node.py    From Auto-PyTorch with Apache License 2.0 5 votes vote down vote up
def clean_fit_data(self):
        node = self
        
        # clear outputs
        while (node is not None):
            node.fit_output = None
            node.predict_output = None
            node = node.child_node 
Example 47
Source File: load_and_format_data.py    From CDSS with GNU General Public License v3.0 5 votes vote down vote up
def clean_raw_data(raw_data_dir, data_out_dir='/home/ec2-user/cs230/scripts/DeepLearning/RNN/data_final/',
                   out_prefix='rnn_data_', out_suffix='.txt', file_prefix='final_data_dropcols_', file_suffix='.txt',
                   nfiles=1):
    '''
    Cleans raw data from healthrex database pull

    Parameters:
        @raw_data_dir: (string) path to directory containing raw data files
        @data_out_dir: (string) path to directory for writing cleaned data
        @file_prefix: (string) prefix of raw data files
        @file_suffix: (string) suffix of raw data files
        @nfiles: (int) how many files to load
    '''
    print("BEGIN CLEAN DATA...")

    for n in range(10, nfiles):
        fpath = raw_data_dir + file_prefix + str(n) + file_suffix
        print("Cleaning {}...".format(fpath))
        opath = data_out_dir + out_prefix + str(n) + out_suffix
        df_total = pd.read_table(fpath, sep='\t', dtype=None, header=2)
        df_total = replace_none_nan(df_total)
        df_total.to_csv(opath, sep='\t', header=True, index=False)

    print("CLEAN DATA COMPLETE")

    return 
Example 48
Source File: DewpointChart.py    From mvp with MIT License 5 votes vote down vote up
def cleanData(data, test=False):
    '''Flatten structure to three columns'''
    out=[]
    for row in data:
#        print row
        hold={}
        # bin the timestamp into 20 minute groups
        # get only the first 19 characters of the timestamp
        d=UTCStrToLDT(row["start_date"]["timestamp"])
        d=d.replace(second=0, minute=int(math.floor(d.minute/20)))
        hold['timestamp']=str(d)
        hold["name"]=row["subject"]["attribute"]["name"]
        hold["value"]=row["subject"]["attribute"]["value"]
        out.append(hold)
    return out 
Example 49
Source File: demand_measures.py    From EnergyPATHWAYS with MIT License 5 votes vote down vote up
def clean_data(self):
        if self.input_type == 'total':
            self.savings = self.clean_timeseries('values', inplace=False, time_index_name='year', time_index=self.years)
        else:
            self.remap(map_from='raw_values', map_to='values', converted_geography=GeoMapper.demand_primary_geography, time_index_name='year',lower=-100) 
Example 50
Source File: eval.py    From nutonomy_pointpillars with MIT License 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    CLASS_NAMES = ['car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car', 'tractor', 'trailer']
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    current_cls_name = CLASS_NAMES[current_class].lower()
    num_gt = len(gt_anno["name"])
    num_dt = len(dt_anno["name"])
    num_valid_gt = 0
    for i in range(num_gt):
        bbox = gt_anno["bbox"][i]
        gt_name = gt_anno["name"][i].lower()
        height = bbox[3] - bbox[1]
        valid_class = -1
        if (gt_name == current_cls_name):
            valid_class = 1
        elif (current_cls_name == "Pedestrian".lower()
              and "Person_sitting".lower() == gt_name):
            valid_class = 0
        elif (current_cls_name == "Car".lower() and "Van".lower() == gt_name):
            valid_class = 0
        else:
            valid_class = -1
        ignore = False
        if ((gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty])
                or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty])
                or (height <= MIN_HEIGHT[difficulty])):
            # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1:
            ignore = True
        if valid_class == 1 and not ignore:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif (valid_class == 0 or (ignore and (valid_class == 1))):
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)
    # for i in range(num_gt):
        if gt_anno["name"][i] == "DontCare":
            dc_bboxes.append(gt_anno["bbox"][i])
    for i in range(num_dt):
        if (dt_anno["name"][i].lower() == current_cls_name):
            valid_class = 1
        else:
            valid_class = -1
        height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1])
        if height < MIN_HEIGHT[difficulty]:
            ignored_dt.append(1)
        elif valid_class == 1:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes 
Example 51
Source File: common.py    From xunfengES with GNU General Public License v3.0 5 votes vote down vote up
def cleanPostData(data):
    data = data.replace(" ","") # 去掉空格
    data = data.split("\n") # 去掉换行
    while "" in data:
        data.remove("")
    return ",".join(data) 
Example 52
Source File: fh_utils.py    From IBATS_HuobiFeeder_old with GNU General Public License v3.0 5 votes vote down vote up
def clean_datetime_remove_time_data(atime):
    """
    Zero out the hour, minute, and second of the given datetime object
    :param atime:
    :return:
    """
    return datetime(atime.year, atime.month, atime.day) 
Example 53
Source File: analyzer.py    From uwsgi-sloth with Apache License 2.0 5 votes vote down vote up
def clean_data_by_key(self, key):
        try:
            del self.data[key]
        except KeyError:
            pass 
Example 54
Source File: fMRI.py    From mmvt with GNU General Public License v3.0 5 votes vote down vote up
def clean_4d_data(args):
    '''
    python -m src.preproc.fMRI -s nmr00474,nmr00502,nmr00515,nmr00603,nmr00609,nmr00626,nmr00629,nmr00650,nmr00657,nmr00669,nmr00674,nmr00681,nmr00683,nmr00692,nmr00698,nmr00710
        -a laus125 -f clean_resting_state_data --template_brain fsaverage5 --fmri_file_template "f.nii*" --remote_subject_dir "/space/franklin/1/users/sx424/mem_flex/subjects/{subject}"'
    '''
    args = fmri.read_cmd_args(dict(
        subject=args.subject,
        atlas=args.atlas,
        function='clean_4d_data',
        fmri_file_template='rest.nii*',
        fsd='rest_linda'
        # template_brain='fsaverage5',
    ))
    pu.run_on_subjects(args, fmri.main) 
Example 55
Source File: backend.py    From neon with Apache License 2.0 5 votes vote down vote up
def clean_data(self, tensor, layer_mkl):
        """
        For MKL backends to clean mkl data (memory not freed)
        """
        return None