Python clean data
60 Python code examples are found related to "clean data".
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: ComplexityData.py From py-ecomplexity with MIT License | 7 votes |
def clean_data(self, val_errors_flag_input):
    """Make ``val`` numeric, index the data, and handle NaNs and duplicates.

    Args:
        val_errors_flag_input: Passed to ``pd.to_numeric`` as ``errors``.

    Side effects:
        ``self.data`` is re-indexed on ``['time', 'loc', 'prod']``; NaN
        values in ``val`` are replaced by 0 and duplicated index entries are
        dropped (first occurrence kept). A warning is emitted in each case.
    """
    # Make sure values are numeric. Assigning the column back (instead of
    # mutating the attribute Series in place) keeps the update reliable when
    # pandas hands out a copy.
    self.data['val'] = pd.to_numeric(
        self.data['val'], errors=val_errors_flag_input)
    self.data.set_index(['time', 'loc', 'prod'], inplace=True)

    if self.data['val'].isnull().values.any():
        warnings.warn('NaN value(s) present, coercing to zero(es)')
        self.data['val'] = self.data['val'].fillna(0)

    # Remove duplicates; the mask is computed once and reused.
    dups = self.data.index.duplicated()
    if dups.any():
        warnings.warn(
            'Duplicate values exist, keeping the first occurrence')
        self.data = self.data[~dups]
Example 2
Source File: zhihu_item.py From FunpySpiderSearchEngine with Apache License 2.0 | 6 votes |
def clean_data(self):
    """Normalise the scraped fields of this item in place.

    Counts are passed through ``extract_num``, timestamps are formatted with
    ``SQL_DATETIME_FORMAT`` and ``content`` is passed through ``remove_tags``.
    ``praise_num`` falls back to 0 and ``update_time`` falls back to
    ``create_time`` when they cannot be converted.
    """
    # ``Exception`` rather than ``BaseException`` / bare ``except`` so that
    # KeyboardInterrupt and SystemExit are never swallowed by the fallbacks.
    try:
        self["praise_num"] = extract_num("".join(self["praise_num"]))
    except Exception:
        self["praise_num"] = 0
    self["comments_num"] = extract_num("".join(self["comments_num"]))
    self["create_time"] = datetime.datetime.fromtimestamp(
        self["create_time"]).strftime(SQL_DATETIME_FORMAT)
    try:
        self["update_time"] = datetime.datetime.fromtimestamp(
            self["update_time"]).strftime(SQL_DATETIME_FORMAT)
    except Exception:
        self["update_time"] = self["create_time"]
    self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
    self["content"] = remove_tags(self["content"])
Example 3
Source File: lagou_companies.py From webspider with MIT License | 6 votes |
def clean_lg_company_data(company_dict):
    """
    Clean the crawled company information in place.

    :param company_dict: tornado.util.ObjectDict
    """
    # Plain whitespace trimming.
    for field in ('size', 'finance_stage'):
        if field in company_dict:
            setattr(company_dict, field, getattr(company_dict, field).strip())

    # Markup stripping.
    for field in ('features', 'address'):
        if field in company_dict:
            setattr(company_dict, field,
                    utils.text.to_plaintext(getattr(company_dict, field)))

    if 'introduce' in company_dict:
        pieces = company_dict.introduce
        company_dict.introduce = ''.join(pieces) if pieces else ''
        company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]

    if 'advantage' in company_dict:
        company_dict.advantage = [
            utils.text.to_plaintext(entry) for entry in company_dict.advantage
        ]
        # The length limit is applied to the serialised JSON text.
        serialised = json.dumps(company_dict.advantage)
        company_dict.advantage = serialised[:constants.COMPANY_ADVANTAGE_MAX_LEN]

    if 'industries' in company_dict:
        company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries))
Example 4
Source File: selectors.py From invana-bot with MIT License | 6 votes |
def clean_data(elements=None, selector=None):
    """
    Extract data from ``elements`` and convert it to the selector's data type.

    :param elements: passed to ``SelectorExtractor`` for extraction
    :param selector: mapping that may carry a ``data_type`` key; a missing
        selector or key means ``"RawField"``
    :return: the result of ``transform_data`` for the extracted data
    """
    # The signature advertises ``selector=None``; treat it as "no options"
    # instead of failing on ``None.get``.
    if selector is None:
        selector = {}
    data_type = selector.get("data_type", "RawField")
    data_extractor = SelectorExtractor()
    # Data types whose name starts with "List" are extracted as lists.
    if data_type.startswith("List"):
        extracted_data = data_extractor.get_list_data(elements=elements)
    else:
        extracted_data = data_extractor.get_single_data(elements=elements)
    return transform_data(data=extracted_data, data_type=data_type)
Example 5
Source File: jquery_scrolldepth.py From carebot with MIT License | 6 votes |
def clean_data(self, data):
    """
    Cast the first three fields of every row to int, order the rows by the
    first field, pass them through ``self.fill_in_max`` and return at most
    the first ten rows. Rows are modified in place.
    """
    rows = []
    for row in data:
        # 0: percent depth on page, 1: total users, 2: seconds on page
        for column in (0, 1, 2):
            row[column] = int(row[column])
        rows.append(row)

    # Order from the lowest depth bucket to the highest.
    ordered = sorted(rows, key=lambda entry: entry[0])
    filled = self.fill_in_max(ordered)

    # Only the first 10 rows are kept.
    return filled[:10]
Example 6
Source File: punctuator.py From keras-punctuator with MIT License | 6 votes |
def cleanData(inputFile):
    """Run the clean-up substitutions over ``inputFile``.

    The ordered pattern/replacement pairs are handed to ``regexProcess``,
    which is given ``inputFile`` and ``inputFile + '.clean'``.

    Returns:
        The name of the cleaned file (``inputFile + '.clean'``).
    """
    sys.stderr.write("Cleaning data " + inputFile + "\n")
    # Raw strings are used so that regex escapes such as ``\g<1>`` and ``\.``
    # are not treated as (invalid) Python string escapes.
    mappings = OrderedDict([
        (re.compile(r"['’]"), "'"),
        # (re.compile("' s([" + DOT_LIKE_AND_SPACE + "])"), "'s\g<1>"), # Removes strange text mistake pattern in europarl data.
        (re.compile(r"n't"), " n't"),
        #(re.compile(" '([^" + DOT_LIKE + "']*)'"), '. \g<1>.'), # Remove quoting apostrophes.
        (re.compile(r"'([^t])"), r" '\g<1>"),  # Separate tokens like "'s" "'ll" and so on.
        #(re.compile('\([^)]*\)'), ''), # Removes bracketed.
        (re.compile(r'[-—]'), ' '),  # Dash to space.
        (re.compile(r"[^a-z0-9A-Z',\.?! ]"), ' '),  # Other unknown to space.
        # (re.compile('^$|^\.$'), ''), # Removes empty line.
    ])
    cleanFile = inputFile + '.clean'
    regexProcess(mappings, inputFile, cleanFile)
    return cleanFile
Example 7
Source File: get_data.py From Short-Text-Summarization with Apache License 2.0 | 6 votes |
def get_clean_data(filepath):
    """Parse ``filepath`` and strip unwanted characters from both text lists.

    Brackets and ``#`` are removed first; after that every character outside
    CJK ideographs, digits and ASCII letters is dropped unless it is one of
    the whitelisted punctuation marks. Each cleaned line is stored back,
    UTF-8 encoded, in the list it came from.
    """
    list_summary, list_short_text = parser_txt_to_data(filepath)

    def _remove_special_char(m):
        s = m.group(0)
        # Whitelisted punctuation survives; everything else is removed.
        return s if s in u',。!?;“”:《》' else ''

    def _scrub_in_place(lines):
        # Apply both substitutions to every entry of ``lines``.
        for position, text in enumerate(lines):
            without_brackets = re.sub(u'[\(\[(#「【\)\])#」】]', '', text)
            lines[position] = re.sub(
                u'[^\u4e00-\u9fa50-9a-zA-Z]',
                _remove_special_char,
                without_brackets,
            ).encode('utf-8')

    _scrub_in_place(list_summary)
    _scrub_in_place(list_short_text)
    return list_summary, list_short_text
Example 8
Source File: kanji.py From apex-sigma-core with GNU General Public License v3.0 | 6 votes |
def clean_readings_data(kanji_dict):
    """
    Strip separator characters from the readings of a kanji entry.

    :param kanji_dict: mapping with a ``'readings'`` entry that maps a reading
        type to a list of strings
    :type kanji_dict: dict
    :return: ``{'readings': {'kun': [...], 'on': [...], 'names': [...]}}``
        where entries that are themselves a separator are skipped
    :rtype: dict
    """
    separators = ['、 ', '、', '\t', ' ']
    cleaned = {'kun': [], 'on': [], 'names': []}
    for reading_type, entries in kanji_dict['readings'].items():
        for entry in entries:
            if entry in separators:
                continue
            for separator in separators:
                entry = entry.replace(separator, '')
            cleaned[reading_type].append(entry)
    return {'readings': cleaned}
Example 9
Source File: train_and_predict.py From titanic_machine_learning_example with MIT License | 6 votes |
def cleanData(data):
    """Clean the passenger frame in place and return it.

    * ``Fare``: zeros are treated as missing and replaced by the mean fare of
      the row's ``Pclass``.
    * ``Name``: mapped through ``parseName``.
    * ``Sex``: ``"male"`` becomes 0, anything else 1.
    """
    # If fare data is missing, replace it with the average from that class.
    data['Fare'] = data['Fare'].map(lambda x: np.nan if x == 0 else x)
    # ``pivot_table(rows=...)`` no longer exists in pandas; a groupby yields
    # the per-class mean as a Series indexed by Pclass.
    classmeans = data.groupby('Pclass')['Fare'].mean()
    data['Fare'] = data['Fare'].fillna(data['Pclass'].map(classmeans))

    # Turn names into a number representing titles
    data['Name'] = data['Name'].map(parseName)

    # Convert sex into a numeric value
    data['Sex'] = data['Sex'].apply(lambda sex: 0 if sex == "male" else 1)

    return data

# Load training and test data sets, cleaning them in the process
Example 10
Source File: cli.py From betterlifepsi with MIT License | 6 votes |
def clean_transaction_data():
    """
    Delete every row from the transaction tables listed below in a single
    statement batch; tables not listed are left untouched.
    """
    # TODO.xqliu Disable clean of database for production
    from psi.app.service import Info

    purge_sql = """
        DELETE FROM related_values;
        DELETE FROM inventory_in_out_link;
        DELETE FROM incoming;
        DELETE FROM shipping_line;
        DELETE FROM shipping;
        DELETE FROM expense;
        DELETE FROM receiving_line;
        DELETE FROM receiving;
        DELETE FROM purchase_order_line;
        DELETE FROM purchase_order;
        DELETE FROM sales_order_line;
        DELETE FROM sales_order;
        DELETE FROM inventory_transaction_line;
        DELETE FROM inventory_transaction;
        commit;
    """
    Info.get_db().engine.execute(purge_sql)
Example 11
Source File: execute.py From olapy with GNU Lesser General Public License v2.1 | 6 votes |
def clean_data(star_schema_df, measures):
    """Convert text measures such as ``"1 349"`` to floats.

    Object-typed measure columns have their spaces removed and are cast to
    float; a column that still cannot be cast is dropped.

    :param star_schema_df: star schema dataframe
    :param measures: list of measure column names (may be empty or None)
    :return: dataframe with cleaned measure columns
    """
    if not measures:
        return star_schema_df

    for measure in measures:
        if star_schema_df[measure].dtype != object:
            continue
        star_schema_df[measure] = star_schema_df[measure].str.replace(" ", "")
        try:
            star_schema_df[measure] = star_schema_df[measure].astype("float")
        except ValueError:
            # ``axis`` is no longer accepted positionally by DataFrame.drop;
            # name the target explicitly.
            star_schema_df = star_schema_df.drop(columns=measure)
    return star_schema_df
Example 12
Source File: lsi_model.py From aca with MIT License | 6 votes |
def clean_data(text):
    """Tokenise ``text`` into lower-case words.

    Leading/trailing whitespace is stripped, the punctuation characters
    ``. , ! : ? `` ` are removed, and the stop words listed below are
    filtered out of the result.
    """
    punctuation = '. , ! : ? ` ` '.split()
    kept_chars = [char for char in text.strip() if char not in punctuation]
    stop_list = set('a is are on from for and not to'.split())
    words = ''.join(kept_chars).lower().split()
    return [word for word in words if word not in stop_list]
Example 13
Source File: LegendaryFish.py From Pirates-Online-Rewritten with BSD 3-Clause "New" or "Revised" License | 6 votes |
def cleanFishData(self):
    """Reset the stamina display and stop the fish's running sequences."""
    # Restore stamina from the fish data and refresh the bar widgets.
    self.staminaValue = self.myData['stamina']
    self.fishStaminaBar['value'] = self.staminaValue
    label_text = str(int(self.staminaValue)) + ' / ' + str(100)
    self.fishStaminaValueLabel.setText(label_text)
    palette = FishingGlobals.fishingStaminaBarColor
    color_index = int(self.staminaValue / 100.0 * (len(palette) - 1))
    self.fishStaminaBar['barColor'] = palette[color_index]
    self.hideStaminaBar()
    taskMgr.remove('updateFishStaminaTask')
    self.lurePosition = None

    # Pause both sequences and rewind them to their initial state.
    for sequence in (self.fishChaseLureSequence, self.lfStruggleSequence):
        sequence.pause()
        sequence.clearToInitial()

    if self.aboutToBitingInterval is not None:
        self.aboutToBitingInterval.pause()
Example 14
Source File: data_cleaning.py From DataProcessing_Python with MIT License | 6 votes |
def clean_data(self):
    """Run the standard cleaning steps in order, then report completion."""
    pipeline = (
        'extract_ids',
        'extract_target',
        'check_column_names',
        'remove_constant_variables',
        'convert_columns_to_binary',
        'check_date_variables',
        'check_categorical_variables',
        'encode_categories',
    )
    for step_name in pipeline:
        # Looked up right before the call, exactly like a direct method call.
        getattr(self, step_name)()
    print("Data is clean and ready!\n")

## function for removing columns
Example 15
Source File: cqhttp_helper.py From python-cqhttp with MIT License | 6 votes |
def clean_data_dir(self, *, data_dir):
    """
    Clean up a data directory

    ------------

    :param str data_dir: name of the directory to clean; `image`, `record`, `show` and `bface` are supported
    :return: None
    :rtype: None

    ------------

    Used to clean data directories that have accumulated too many old files, such as `image`.

    Added in HTTP API v3.3.4
    """
    api_call = super().__getattr__('clean_data_dir')
    return api_call(data_dir=data_dir)
Example 16
Source File: cqhttp_helper.py From python-cqhttp with MIT License | 6 votes |
def clean_data_dir_async(self, *, data_dir):
    """
    Clean up a data directory (asynchronous version)

    ------------

    :param str data_dir: name of the directory to clean; `image`, `record`, `show` and `bface` are supported
    :return: None
    :rtype: None

    ------------

    Used to clean data directories that have accumulated too many old files, such as `image`.

    Added in HTTP API v3.3.4
    """
    api_call = super().__getattr__('clean_data_dir_async')
    return api_call(data_dir=data_dir)
Example 17
Source File: event_study.py From sanpy with MIT License | 6 votes |
def clean_data(data, events, starting_point):
    """Drop events (signals) that do not have enough pricing data.

    An event is dropped when its date is not in ``data.index``, its symbol is
    not a column of ``data``, or the price window of ``starting_point`` rows
    on either side of the event is empty or has a minimum of 0.

    :param data: price frame indexed by date with one column per symbol
    :param events: frame indexed by date with a ``symbol`` column
    :param starting_point: number of rows taken before and after the event
    :return: copy of ``events`` restricted to usable rows; it carries the
        helper column ``in_pricesdf`` (all zeros)
    """
    events_df = events.copy(deep=True)
    # Helper flag written through the last column position: 1 = drop.
    events_df['in_pricesdf'] = 0

    # ``enumerate`` supplies the row position without shadowing builtin ``id``.
    for position, (date, row) in enumerate(events_df.iterrows()):
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            events_df.iloc[position, -1] = 1
            continue

        event_day = data.index.searchsorted(date)
        hist_index_start = event_day - starting_point
        hist_index_end = event_day + starting_point
        event_window = data.iloc[hist_index_start:hist_index_end][sid]

        # Emptiness is tested first; the minimum is read from the column
        # itself rather than by positional lookup on a labelled Series.
        if len(event_window) == 0 or event_window.min() == 0:
            events_df.iloc[position, -1] = 1

    return events_df[events_df['in_pricesdf'] == 0]
Example 18
Source File: auxiliary_dataset.py From ZeroShotVideoClassification with Apache License 2.0 | 6 votes |
def clean_data(fnames, labels, broken_videos_file='assets/kinetics_broken_videos.txt', prefix_len=75):
    """Remove samples whose video is listed as broken.

    Args:
        fnames: sequence of video file names (strings).
        labels: sequence of labels aligned with ``fnames``.
        broken_videos_file: text file with one broken video name per line.
        prefix_len: number of leading characters cut from every file name
            before it is compared with the broken list.

    Returns:
        ``(fnames, labels)``. The inputs are returned unchanged when the
        check cannot be performed (empty or non-string input, or missing
        list file); otherwise both are numpy arrays without the broken
        samples.
    """
    if len(fnames) == 0 or not isinstance(fnames[0], str):
        print('Cannot check for broken videos')
        return fnames, labels

    if not os.path.exists(broken_videos_file):
        print('Broken video list does not exists')
        return fnames, labels

    t = time()
    with open(broken_videos_file, 'r') as f:
        # Strip only the line terminator so that a final line without a
        # trailing newline keeps its last character.
        broken_samples = [line.rstrip('\n') for line in f]

    data = [x[prefix_len:] for x in fnames]
    keep_sample = ~np.isin(data, broken_samples)
    fnames = np.array(fnames)[keep_sample]
    labels = np.array(labels)[keep_sample]
    print('Broken videos %.2f%% - removing took %.2f' % (100 * (1.0 - keep_sample.mean()), time() - t))
    return fnames, labels
Example 19
Source File: model.py From libhxl-python with The Unlicense | 6 votes |
def clean_data(
        self,
        whitespace=[],
        upper=[],
        lower=[],
        date=[],
        date_format=None,
        number=[],
        number_format=None,
        latlon=[],
        purge=False,
        queries=[]
):
    """Return a ``CleanDataFilter`` over this object.

    Every argument is forwarded unchanged, under the same name, to
    ``hxl.filters.CleanDataFilter``.
    """
    import hxl.filters

    options = dict(
        whitespace=whitespace,
        upper=upper,
        lower=lower,
        date=date,
        date_format=date_format,
        number=number,
        number_format=number_format,
        latlon=latlon,
        purge=purge,
        queries=queries,
    )
    return hxl.filters.CleanDataFilter(self, **options)
Example 20
Source File: cron.py From oh-my-rss with MIT License | 6 votes |
def clean_history_data():
    """
    Purge historical data
    :return:
    """
    logger.info('开始清理历史数据')

    lastweek = datetime.now() - timedelta(days=7)
    last3month = datetime.now() - timedelta(days=90)
    lastyear = datetime.now() - timedelta(days=365)

    # star in (, 10): delete outright
    stale_low_star = Article.objects.filter(site__star__lt=10, ctime__lte=lastweek)
    stale_low_star.delete()

    # star in [10, 20): created more than 3 months ago, blank the content
    stale_mid_star = Article.objects.filter(
        site__star__gte=10, site__star__lt=20, ctime__lte=last3month)
    stale_mid_star.update(content=' ')

    # star in [20, ): created more than a year ago, blank the content
    stale_high_star = Article.objects.filter(site__star__gte=20, ctime__lte=lastyear)
    stale_high_star.update(content=' ')

    # compact the database
    vacuum_sqlite_db()

    logger.info('历史数据清理完毕')
Example 21
Source File: sdk.py From darwin-sdk with Apache License 2.0 | 6 votes |
def clean_data(self, dataset_name, **kwargs):
    """POST a clean-data request for ``dataset_name``.

    ``kwargs`` are sent as the JSON body. When the server answers that the
    data must be analyzed first, ``analyze_data`` is run (using the
    ``char_encoding`` keyword, default ``'utf-8'``) and the request is sent
    once more if that succeeded.

    Returns ``(False, message)`` when no auth header is available, the
    failed ``analyze_data`` result, or ``self.get_return_info(response)``.
    """
    url = self.server_url + self.routes['clean_data'] + urllib.parse.quote(dataset_name, safe='')
    headers = self.get_auth_header()
    if headers is None:
        return False, "Cannot get Auth token. Please log in."

    response = self.s.post(url, headers=headers, json=kwargs)
    needs_analysis = not response.ok and 'Please run analyze data' in response.text
    if needs_analysis:
        print("Raw profile not found. Running analyze_data")
        char_encoding = kwargs.get('char_encoding', 'utf-8')
        analysis = self.analyze_data(dataset_name, char_encoding=char_encoding)
        if not analysis[0]:
            return analysis
        response = self.s.post(url, headers=headers, json=kwargs)
    return self.get_return_info(response)

# Create risk information for a datatset
Example 22
Source File: atlas3.py From ssbio with MIT License | 6 votes |
def clean_data(self, keep_features=None, remove_correlated_feats=True):
    """Filter ``self.features_df`` down to informative, non-redundant rows.

    Steps, all applied to ``self.features_df`` in place of the old value:

    1. Cast to float, replace NaN by 0 and keep only rows that have at least
       one value greater than 0.
    2. If ``keep_features`` is truthy, keep only rows whose index label is in
       it.
    3. If ``remove_correlated_feats`` is true, cluster the rows by Spearman
       correlation and keep one representative row per cluster.

    Args:
        keep_features: optional collection of index labels to retain.
        remove_correlated_feats: whether to run the correlation clustering.
    """
    self.features_df = self.features_df.astype(float).fillna(0)
    self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

    if keep_features:
        self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

    if remove_correlated_feats:
        # Work on the transpose so that each row of features_df is a column.
        tmp = self.features_df.T

        # Remove columns with no variation
        nunique = tmp.apply(pd.Series.nunique)
        cols_to_drop = nunique[nunique == 1].index
        tmp.drop(cols_to_drop, axis=1, inplace=True)

        # Distance between two columns is 1 - |Spearman correlation|.
        perc_spearman = scipy.stats.spearmanr(tmp)
        abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                               np.absolute(perc_spearman.correlation))
        np.fill_diagonal(abs_corr, 0)
        abs_corr_clean = np.maximum(abs_corr,
                                    abs_corr.transpose())  # some floating point mismatches, just make symmetric
        # Average-linkage clustering, cut at a distance of 0.1.
        clustering = linkage(squareform(abs_corr_clean), method='average')
        clusters = fcluster(clustering, .1, criterion='distance')
        names = tmp.columns.tolist()
        names_to_cluster = list(zip(names, clusters))
        indices_to_keep = []
        ### Extract models closest to cluster centroids
        for x in range(1, len(set(clusters)) + 1):
            # Create mask from the list of assignments for extracting submatrix of the cluster
            mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

            # Take the index of the column with the smallest sum of distances from the submatrix
            idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

            # Extract names of cluster elements from names_to_cluster
            sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

            # Element closest to centroid
            centroid = sublist[idx]
            indices_to_keep.append(centroid)

        # Only the per-cluster representatives remain; columns dropped above
        # for lack of variation never enter this list.
        self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)]
Example 23
Source File: cluster.py From yass with Apache License 2.0 | 6 votes |
def clean_input_data(self):
    """Subsample the spike times and drop spikes near the recording edges.

    At most ``CONFIG.cluster.max_n_spikes`` spikes are kept (random choice
    without replacement). Spikes earlier than ``spike_size // 2`` or at or
    after ``rec_len - spike_size`` are then removed. Unless ``raw_data`` is
    set, ``template_ids_in`` receives the same two selections.
    """
    max_spikes = self.CONFIG.cluster.max_n_spikes
    n_spikes = len(self.spike_times_original)

    # Cap the number of spikes used for clustering.
    if n_spikes > max_spikes:
        idx_sampled = np.random.choice(
            a=np.arange(n_spikes), size=max_spikes, replace=False)
        self.spike_times_original = self.spike_times_original[idx_sampled]
    else:
        idx_sampled = np.arange(n_spikes)

    # Keep only spikes away from the edges of the recording.
    lower_bound = self.spike_size // 2
    upper_bound = self.reader_raw.rec_len - self.spike_size
    after_start = self.spike_times_original >= lower_bound
    before_end = self.spike_times_original < upper_bound
    idx_inbounds = np.where(np.logical_and(after_start, before_end))[0]
    self.spike_times_original = self.spike_times_original[idx_inbounds].astype('int32')

    # Apply the same selections to the upsampled ids if available.
    if not self.raw_data:
        sampled_ids = self.template_ids_in[idx_sampled]
        self.template_ids_in = sampled_ids[idx_inbounds].astype('int32')
Example 24
Source File: marriage.py From DataExploration with MIT License | 6 votes |
def CleanData(resp):
    """Cleans a respondent DataFrame.

    resp: DataFrame of respondents

    The codes 9997, 9998 and 9999 in ``cmmarrhx`` are replaced by NaN.

    Adds columns: agemarry, age, year, decade, fives
    """
    # Assign the cleaned column back instead of mutating the attribute Series
    # in place, which does not reliably write through to the frame.
    resp['cmmarrhx'] = resp['cmmarrhx'].replace([9997, 9998, 9999], np.nan)

    # The differences of the cm* columns are divided by 12 to get years.
    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

    # cmbirth is added as a month offset to this reference date.
    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) for cm in resp.cmbirth]
    resp['year'] = (pd.DatetimeIndex(dates).year - 1900)
    resp['decade'] = resp.year // 10
    resp['fives'] = resp.year // 5
Example 25
Source File: clean.py From santander-product-recommendation-8th-place with MIT License | 6 votes |
def clean_data(fi, fo, header, suffix):
    """Copy comma-separated lines from ``fi`` to ``fo``, repairing split fields.

    The first line of ``fi`` is the header. A data line with more fields
    than the header is assumed to have a comma inside its ``nomprov`` value;
    that field is glued to the following one (the comma itself is dropped).

    Args:
        fi: readable text file object positioned at the header line.
        fo: writable text file object.
        header: when truthy, the unquoted header is written first.
        suffix: text appended to every data line before the newline.

    Raises:
        AssertionError: if a line does not match the header's field count
            after the repair.
    """
    head = [name.strip('"') for name in fi.readline().strip("\n").split(",")]
    # Position of the column that may contain a stray comma. The last match
    # wins; it stays None when the column is absent, so nothing is repaired.
    ip = None
    for i, name in enumerate(head):
        if name == "nomprov":
            ip = i
    print(ip)
    n = len(head)
    if header:
        fo.write("%s\n" % ",".join(head))
    print(n)

    for line in fi:
        fields = line.strip("\n").split(",")
        if ip is not None and len(fields) > n:
            prov = fields[ip] + fields[ip + 1]
            del fields[ip]
            fields[ip] = prov
        # Raised explicitly so the check also runs under ``python -O``.
        if len(fields) != n:
            raise AssertionError(
                "expected %d fields, got %d" % (n, len(fields)))
        fields = [field.strip() for field in fields]
        fo.write("%s%s\n" % (",".join(fields), suffix))
Example 26
Source File: filter.py From Phen2Gene with MIT License | 6 votes |
def clean_term_data(HPid, xref, is_a, name, definition, is_obsolete, replaced_by, consider, alt_id, synonym,
                    created_by, creation_date, comment, subset, property_value):
    """Return a fresh, empty set of term fields.

    The incoming values are ignored. The result is a tuple in parameter
    order: string fields are ``""``, list fields are new empty lists and
    ``is_obsolete`` is ``False``.
    """
    return (
        "",     # HPid
        [],     # xref
        [],     # is_a
        "",     # name
        "",     # definition
        False,  # is_obsolete
        [],     # replaced_by
        [],     # consider
        [],     # alt_id
        [],     # synonym
        "",     # created_by
        "",     # creation_date
        "",     # comment
        "",     # subset
        "",     # property_value
    )
Example 27
Source File: make_mock_json.py From osbs-client with BSD 3-Clause "New" or "Revised" License | 6 votes |
def clean_data(self, out_data):
    """Recursively rewrite matching strings inside ``out_data``.

    Dicts and lists are rebuilt with every value/item cleaned. A string that
    matches ``self.rh_pattern`` has all matches of ``self.ex_pattern``
    replaced by ``"example.com"``. Anything else is returned unchanged.
    """
    if isinstance(out_data, dict):
        return {key: self.clean_data(value) for key, value in out_data.items()}
    if isinstance(out_data, list):
        return [self.clean_data(item) for item in out_data]
    if isinstance(out_data, str) and re.search(self.rh_pattern, out_data):
        return re.sub(self.ex_pattern, "example.com", out_data)
    return out_data
Example 28
Source File: movie.py From BoxOfficeMojo with MIT License | 6 votes |
def clean_data(self):
    """Formats all the extracted data into the appropriate types"""
    for results in self.data["Weekly"]:
        utils.convert_financial_field(results, "Average Per Theatre")
        utils.convert_financial_field(results, "Gross")
        utils.convert_financial_field(results, "Gross To Date")
        utils.convert_percent_field(results, "Week Over Week Change")
        utils.convert_date_field(results, "Week")
        utils.convert_int_field(results, "Rank")
        utils.convert_int_field(results, "Theaters")
        utils.convert_int_field(results, "Theatre Change")
        utils.convert_int_field(results, "Week Number")

    # Iterate over a snapshot of the keys: ``dict.iteritems`` exists only on
    # Python 2 and the dict is modified inside the loop. Only the first
    # matching key is removed, as before.
    for key in list(self.data):
        if "Total Gross" in key or "." in key:
            self.data.pop(key)
            break

    utils.standardize_keys(self.data)
Example 29
Source File: data_cleanup.py From DeepLearning-IDS with MIT License | 6 votes |
def cleanAllData():
    """Run ``cleanData`` on every regular file in the input directory.

    Hidden entries (names starting with ``.``) and sub-directories are
    skipped. The output directory is created when missing, and each file is
    written there under its original name.
    """
    # inputDataPath = os.path.join(
    #     os.path.dirname(os.path.realpath(__file__)), )
    inputDataPath = '../ProcessedTrafficData'
    outputDataPath = '../NewCleanedData'
    if not os.path.exists(outputDataPath):
        os.mkdir(outputDataPath)

    for fileName in os.listdir(inputDataPath):
        if fileName.startswith('.'):
            continue
        inputFile = os.path.join(inputDataPath, fileName)
        # Test the path inside the input directory; the bare name would be
        # resolved against the current working directory.
        if os.path.isdir(inputFile):
            continue
        outFile = os.path.join(outputDataPath, fileName)
        cleanData(inputFile, outFile)
Example 30
Source File: report_db_accessor_base.py From koku with GNU Affero General Public License v3.0 | 6 votes |
def clean_data(self, data, table_name):
    """Clean data for insertion into database.

    Args:
        data (dict): The data to be cleaned
        table_name (str): The table name the data is associated with

    Returns:
        (dict): The same dict, with ``None``/empty-string values set to
        ``None`` and int, float and Decimal columns converted through
        ``self._convert_value``

    """
    column_types = self.report_schema.column_types[table_name]

    for key, value in data.items():
        if value is None or value == "":
            data[key] = None
            continue

        declared = column_types.get(key)
        if declared == int or declared == "BigIntegerField":
            target = int
        elif declared == float:
            target = float
        elif declared == Decimal:
            target = Decimal
        else:
            # Columns of any other type are left as they are.
            continue
        data[key] = self._convert_value(value, target)

    return data
Example 31
Source File: run_lfads.py From object_detection_with_tensorflow with MIT License | 5 votes |
def clean_data_dict(data_dict):
    """Fill in the optional LFADS keys that are missing from the data dict.

    Args:
      data_dict - dictionary containing data for LFADS

    Returns:
      The same data_dict, with each absent optional key set to None.
    """
    optional_keys = ('train_truth', 'train_ext_input', 'valid_data',
                     'valid_truth', 'valid_ext_input', 'valid_train')
    missing = [key for key in optional_keys if key not in data_dict]
    for key in missing:
        data_dict[key] = None
    return data_dict
Example 32
Source File: eval.py From Det3D with Apache License 2.0 | 5 votes |
def clean_data(gt_anno, dt_anno, current_cls_name, difficulty=None):
    """Flag ground-truth and detection boxes by class match.

    Ground-truth names are lower-cased before the comparison with
    ``current_cls_name``; detection names are compared as they are. A
    matching box gets flag 0, any other box gets -1. ``difficulty`` is not
    used.

    Returns:
        ``(num_valid_gt, ignored_gt, ignored_dt, dc_bboxes)`` where
        ``dc_bboxes`` is always an empty list.
    """
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]

    ignored_gt = [
        0 if gt_name.lower() == current_cls_name else -1
        for gt_name in gt_anno["name"]
    ]
    ignored_dt = [
        0 if dt_name == current_cls_name else -1
        for dt_name in dt_anno["name"]
    ]
    num_valid_gt = ignored_gt.count(0)
    dc_bboxes = []
    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
Example 33
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0 | 5 votes |
def clean_data(self, df, target_var, task_type, name):
    """Clean a dataset and write it to ``/tmp``.

    Zero-variance columns are dropped and values are imputed using the
    statistics returned by ``calculate_stats``. For classification tasks a
    float64 target is cast to int64, and for the ``train`` split the LIME
    explainer is created.

    Arguments:
        df : dask dataframe, The dataframe to be cleaned
        target_var : string, Name of the target variable
        task_type : string, Type of the task at hand
        name : string, Name of the data being cleaned (train or eval)
    Returns:
        df : dask dataframe, Cleaned dataframe
        mean : dask series, mean of each column
        std_dev : dask series, standard deviation of each column
        _csv_defaults : list, list of default value of each column
    """
    mean, median, mode, std_dev = self.calculate_stats(df, target_var)
    df = self.dropping_zero_var_cols(df, target_var, std_dev)
    df = self.impute(df, target_var, median, mode)

    is_classification = task_type == 'classification'
    if is_classification and df[target_var].dtype == 'float64':
        df[target_var] = df[target_var].astype(np.int64)

    # One single-element default per column, chosen by the column's dtype.
    dtype_map = {'float64': 0., 'int64': 0, 'object': ''}
    dtype_list = [str(dtype) for dtype in df.dtypes]
    _csv_defaults = [[dtype_map[dtype_name]] for dtype_name in dtype_list]

    if name == 'train' and is_classification:
        self.creating_explainer_lime(df, target_var)

    df.to_csv('/tmp/clean_*_' + str(name) + '.csv', index=False)
    return df, mean, std_dev, _csv_defaults
Example 34
Source File: telescopes.py From pyLIMA with GNU General Public License v3.0 | 5 votes |
def clean_data_magnitude(self):
    """
    Remove unusable points from the magnitude lightcurve.

    A row is rejected when any of its columns is nan or when the absolute
    value of its third column (the error bar) is above 1 mag. The indices of
    rejected rows, if any, are stored in ``bad_points_magnitude``.

    :return: the cleaned magnitude lightcurve
    :rtype: array_like
    """
    maximum_accepted_precision = 1.0

    has_nan = np.isnan(self.lightcurve_magnitude).any(axis=1)
    abs_error = np.abs(self.lightcurve_magnitude[:, 2])

    good_rows = np.where(~has_nan & (abs_error <= maximum_accepted_precision))[0]
    lightcurve = self.lightcurve_magnitude[good_rows]

    bad_rows = np.where(has_nan | (abs_error > maximum_accepted_precision))[0]
    if len(bad_rows) != 0:
        self.bad_points_magnitude = bad_rows
        print('pyLIMA found some bad points in the telescope ' + self.name +
              ', you can found these in the bad_points_magnitude attribute.')

    return lightcurve
Example 35
Source File: telescopes.py From pyLIMA with GNU General Public License v3.0 | 5 votes |
def clean_data_flux(self):
    """
    Remove unusable points from the flux lightcurve.

    A row is rejected when any of its columns is nan, when the absolute
    ratio of its third column to its second column (error over flux) is
    above 1, or when the flux is not strictly positive. The indices of
    rejected rows, if any, are stored in ``bad_points_flux``.

    :return: the cleaned flux lightcurve
    :rtype: array_like
    """
    maximum_accepted_precision = 1.0
    flux = self.lightcurve_flux[:, 1]
    error_flux = self.lightcurve_flux[:, 2]

    keep_mask = ((~np.isnan(self.lightcurve_flux).any(axis=1)) &
                 (np.abs(error_flux / flux) <= maximum_accepted_precision) &
                 (flux > 0))
    lightcurve = self.lightcurve_flux[np.where(keep_mask)[0]]

    reject_mask = ((np.isnan(self.lightcurve_flux).any(axis=1)) |
                   (np.abs(error_flux / flux) > maximum_accepted_precision) |
                   (flux <= 0))
    index = np.where(reject_mask)[0]
    if len(index) != 0:
        self.bad_points_flux = index
        print('pyLIMA found some bad points in the telescope ' + self.name +
              ', you can found these in the bad_points_flux attribute.')

    return lightcurve
Example 36
Source File: readers.py From PVGeo with BSD 3-Clause "New" or "Revised" License | 5 votes |
def clean_data_name(data_name, filename):
    """Return ``data_name``, or the extension-less base name of ``filename``
    when ``data_name`` is ``None`` or an empty string."""
    if data_name is not None and not data_name == '':
        return data_name
    base_name = os.path.basename(filename)
    stem, _extension = os.path.splitext(base_name)
    return stem
Example 37
Source File: taxi.py From code-snippets with Apache License 2.0 | 5 votes |
def clean_raw_data_dict(input_dict, raw_feature_spec):
    """Build a dict with one list per feature in ``raw_feature_spec``.

    A feature that is absent from ``input_dict`` or has a falsy value maps
    to an empty list; otherwise it maps to a one-element list holding the
    value.
    """
    return {
        key: [input_dict[key]] if key in input_dict and input_dict[key] else []
        for key in raw_feature_spec
    }
Example 38
Source File: filterUtils.py From director with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cleanPolyData(polyData):
    """Run ``vtkCleanPolyData`` on ``polyData`` and return a shallow copy of
    the filter output."""
    cleaner = vtk.vtkCleanPolyData()
    cleaner.SetInputData(polyData)
    cleaner.Update()
    cleaned_output = cleaner.GetOutput()
    return shallowCopy(cleaned_output)
Example 39
Source File: evaluation_runner.py From moviegeek with MIT License | 5 votes |
def clean_data(self, ratings, min_ratings=5):
    """Keep only the ratings of users with more than ``min_ratings`` ratings.

    Args:
        ratings: DataFrame with at least ``user_id`` and ``movie_id`` columns.
        min_ratings: exclusive lower bound; a user needs strictly more than
            this many rated movies to be kept.

    Returns:
        The filtered DataFrame.
    """
    # The filter is a strict ">" comparison, so the message says "more than"
    # rather than "at least".
    self.logger.debug("cleaning data only to contain users with more than {} ratings".format(min_ratings))

    original_size = ratings.shape[0]

    # Number of (non-null) movie ids per user.
    ratings_per_user = ratings.groupby('user_id')['movie_id'].count()
    user_ids = ratings_per_user[ratings_per_user > min_ratings].index

    ratings = ratings[ratings['user_id'].isin(user_ids)]
    new_size = ratings.shape[0]
    self.logger.debug('reduced dataset from {} to {}'.format(original_size, new_size))
    return ratings
Example 40
Source File: holoclean.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def load_clean_data(self, file_path):
    """
    Read the clean cells from a csv file (with header) and register them
    as table ``C_clean`` for this dataset

    :param file_path: path to file
    :return: spark dataframe of clean cells
    """
    environment = self.holo_env
    clean = environment.spark_session.read.csv(file_path, header=True)
    environment.dataengine.add_db_table('C_clean', clean, self.dataset)
    return clean
Example 41
Source File: mturk_depth_api_lsp.py From rel_3d_pose with MIT License | 5 votes |
def cleanAssignmentData(ass_data):
    """Convert a raw assignment record into the polished result layout.

    Returns a 4-tuple: the polished dict, an error flag (always ``False``),
    and the record's ``_hit_reject_flag`` and ``_hit_flag`` values.
    """
    # (polished key, raw key) pairs, copied in this order.
    field_map = (
        ('worker_id', '_worker_id'),
        ('worker_exp', '_worker_exp'),
        ('assignment_id', '_assignment_id'),
        ('hit_id', '_hit_id'),
        ('response_time', '_hit_rt'),
        ('hit_comment', '_hit_comment'),
        ('hit_it', '_hit_it'),
        ('gui_rating', '_gui_rating'),
    )
    _polished_data = {}
    for polished_key, raw_key in field_map:
        _polished_data[polished_key] = ass_data[raw_key]

    # Subject ids are looked up by HIT id in the third collection.
    hit_document = _mongo_coll_3.find_one({'_amt_hit_id': _polished_data['hit_id']})
    _polished_data['lsp_subj_ids'] = hit_document['_lsp_subjs_ids']

    _ass_trials = []
    for _trial in json.loads(ass_data['_trials_results']).values():
        _trial_info = {
            "depth": json.loads(_trial['_depth_str']),
            'response_time': _trial['_trial_rt'],
            'lsp_subj_id': _trial['_lsp_subj_id'],
        }
        # The image id is looked up by subject id in the first collection.
        subject_document = _mongo_coll_1.find_one({'_lsp_subj_id': _trial['_lsp_subj_id']})
        _trial_info['img_id'] = subject_document['_lsp_img_id']
        _ass_trials.append(_trial_info)

    _polished_data['trials'] = _ass_trials
    _error = False
    return (_polished_data, _error, ass_data['_hit_reject_flag'], ass_data['_hit_flag'])
Example 42
Source File: visualize_preprocessing.py From kaggle-heart with MIT License | 5 votes |
def clean_image_data(imdata, metadata):
    """
    Normalize the contrast of every image in imdata, in place.

    The 5th and 95th percentiles over all pixels of all images are mapped to
    0.0 and 1.0 respectively, and values outside that range are clipped.

    :param imdata: indexable collection of image arrays; entries are replaced
    :param metadata: not used
    :return: imdata, with every entry rescaled to [0.0, 1.0]
    """
    # normalize contrast
    flat_data = np.concatenate([i.flatten() for i in imdata]).flatten()
    high = np.percentile(flat_data, 95.0)
    low = np.percentile(flat_data, 5.0)
    # Formatted explicitly so the statement is valid, and prints the same
    # text, on both Python 2 and Python 3.
    print("%s %s" % (high, low))
    # NOTE(review): high == low (constant input) divides by zero below.
    for i in range(len(imdata)):
        image = imdata[i]
        image = 1.0 * (image - low) / (high - low)
        image = np.clip(image, 0.0, 1.0)
        imdata[i] = image

    return imdata
Example 43
Source File: gas.py From BNAF with MIT License | 5 votes |
def load_data_and_clean(file):
    """Load ``file``, drop flagged columns and standardise the rest.

    While ``get_correlation_numbers`` reports a value above 1 for any
    column, the first such column is dropped and the numbers are
    recomputed. The remaining columns are then shifted by their mean and
    scaled by their standard deviation.
    """
    data = load_data(file)

    while True:
        correlation_counts = get_correlation_numbers(data)
        if not np.any(correlation_counts > 1):
            break
        first_offender = np.where(correlation_counts > 1)[0][0]
        data.drop(data.columns[first_offender], axis=1, inplace=True)
    # print(data.corr())

    data = (data - data.mean()) / data.std()
    return data
Example 44
Source File: gas.py From BNAF with MIT License | 5 votes |
def load_data_and_clean_and_split(file):
    """Load and clean ``file``, then split it into train/validation/test.

    The last 10% of the rows (rounded down) form the test set; of the
    remaining rows, the last 10% (rounded down) form the validation set.

    :return: ``(data_train, data_validate, data_test)`` as numpy arrays
    """
    # ``.values`` replaces ``DataFrame.as_matrix()``, which no longer exists
    # in current pandas.
    data = load_data_and_clean(file).values

    # Slice with explicit positive boundaries: with a negative offset a split
    # size of 0 would turn ``data[-0:]`` into "everything".
    N_test = int(0.1 * data.shape[0])
    test_start = data.shape[0] - N_test
    data_test = data[test_start:]
    data_train = data[:test_start]

    N_validate = int(0.1 * data_train.shape[0])
    validate_start = data_train.shape[0] - N_validate
    data_validate = data_train[validate_start:]
    data_train = data_train[:validate_start]

    return data_train, data_validate, data_test
Example 45
Source File: utils.py From pycrop-yield-prediction with MIT License | 5 votes |
def load_clean_yield_data(yield_data_filepath):
    """
    Read the yield csv and drop every row that has a NaN in any of the
    columns we care about
    """
    important_columns = ['Year', 'State ANSI', 'County ANSI', 'Value']
    raw_yield_data = pd.read_csv(yield_data_filepath)
    return raw_yield_data.dropna(subset=important_columns, how='any')
Example 46
Source File: node.py From Auto-PyTorch with Apache License 2.0 | 5 votes |
def clean_fit_data(self):
    """Reset ``fit_output`` and ``predict_output`` to ``None`` on this node
    and on every node reachable through ``child_node``."""
    current = self
    while current is not None:
        # clear outputs
        current.fit_output = current.predict_output = None
        current = current.child_node
Example 47
Source File: load_and_format_data.py From CDSS with GNU General Public License v3.0 | 5 votes |
def clean_raw_data(raw_data_dir,
                   data_out_dir='/home/ec2-user/cs230/scripts/DeepLearning/RNN/data_final/',
                   out_prefix='rnn_data_', out_suffix='.txt',
                   file_prefix='final_data_dropcols_', file_suffix='.txt',
                   nfiles=1, start_index=10):
    '''
    Cleans raw data from healthrex database pull

    For every index n in range(start_index, nfiles) the file
    raw_data_dir + file_prefix + str(n) + file_suffix is read as a
    tab-separated table, passed through replace_none_nan, and written
    tab-separated to data_out_dir + out_prefix + str(n) + out_suffix.

    NOTE: with the defaults (nfiles=1, start_index=10) the range is empty,
    so nothing is read or written; pass start_index=0 to process files
    0 .. nfiles-1.

    Parameters:
        @raw_data_dir: (string) path to directory containing raw data files;
            joined by plain concatenation, so include the trailing separator
        @data_out_dir: (string) path to directory for writing cleaned data;
            also joined by plain concatenation
        @out_prefix: (string) prefix of cleaned output files
        @out_suffix: (string) suffix of cleaned output files
        @file_prefix: (string) prefix of raw data files
        @file_suffix: (string) suffix of raw data files
        @nfiles: (int) exclusive upper bound of the file indices to process
        @start_index: (int) first file index to process (was hard-coded to 10)
    '''
    print("BEGIN CLEAN DATA...")
    for n in range(start_index, nfiles):
        fpath = raw_data_dir + file_prefix + str(n) + file_suffix
        print("Cleaning {}...".format(fpath))
        opath = data_out_dir + out_prefix + str(n) + out_suffix
        df_total = pd.read_table(fpath, sep='\t', dtype=None, header=2)
        df_total = replace_none_nan(df_total)
        df_total.to_csv(opath, sep='\t', header=True, index=False)
    print("CLEAN DATA COMPLETE")
Example 48
Source File: DewpointChart.py From mvp with MIT License | 5 votes |
def cleanData(data, test=False):
    '''Reduce each nested record to a flat dict: timestamp, name, value.

    The timestamp is UTCStrToLDT(row["start_date"]["timestamp"]) with the
    seconds set to 0 and the minute field replaced by floor(minute / 20),
    stored as a string. name and value are copied from
    row["subject"]["attribute"].

    :param data: iterable of nested records
    :param test: not used by this function
    :return: list of flat dicts, in input order
    '''
    flattened = []
    for record in data:
        stamp = UTCStrToLDT(record["start_date"]["timestamp"])
        # The minute field becomes the index of its 20-minute window
        # (0, 1 or 2 for minutes 0-59), not the window's starting minute.
        window = int(math.floor(stamp.minute/20))
        stamp = stamp.replace(second=0, minute=window)
        attribute = record["subject"]["attribute"]
        flattened.append({
            'timestamp': str(stamp),
            "name": attribute["name"],
            "value": attribute["value"],
        })
    return flattened
Example 49
Source File: demand_measures.py From EnergyPATHWAYS with MIT License | 5 votes |
def clean_data(self):
    """Prepare this measure's values according to ``self.input_type``.

    For input type 'total', ``self.savings`` is set to the result of
    ``clean_timeseries`` on 'values' (not in place), indexed by year over
    ``self.years``. For any other input type, 'raw_values' is remapped to
    'values' on the demand primary geography with ``lower=-100``.
    """
    if self.input_type == 'total':
        self.savings = self.clean_timeseries(
            'values',
            inplace=False,
            time_index_name='year',
            time_index=self.years,
        )
        return

    self.remap(
        map_from='raw_values',
        map_to='values',
        converted_geography=GeoMapper.demand_primary_geography,
        time_index_name='year',
        lower=-100,
    )
Example 50
Source File: eval.py From nutonomy_pointpillars with MIT License | 5 votes |
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    """Flag ground-truth and detection boxes for one class and difficulty.

    Ground-truth flags (``ignored_gt``):
      * 0  - name equals the target class and the box passes the occlusion,
             truncation and height limits (also counted in ``num_valid_gt``)
      * 1  - name equals the target class but a limit is violated, or the
             name is a neighbouring class (person_sitting for pedestrian,
             van for car)
      * -1 - any other name

    Detection flags (``ignored_dt``):
      * 1  - box height is below the minimum height for the difficulty
      * 0  - tall enough and the name equals the target class
      * -1 - tall enough but a different name

    :param gt_anno: dict with "name", "bbox", "occluded" and "truncated"
    :param dt_anno: dict with "name" and "bbox"; "bbox" is indexed as
        ``[i, 3]``, so it must support tuple indexing
    :param current_class: index into the internal class-name table
    :param difficulty: index (0-2) into the height/occlusion/truncation limits
    :return: ``(num_valid_gt, ignored_gt, ignored_dt, dc_bboxes)`` where
        ``dc_bboxes`` holds the boxes of ground truths named "DontCare"
    """
    class_names = ('car', 'pedestrian', 'cyclist', 'van', 'person_sitting',
                   'car', 'tractor', 'trailer')
    min_height = (40, 25, 25)
    max_occlusion = (0, 1, 2)
    max_truncation = (0.15, 0.3, 0.5)
    # (target, ground-truth name) pairs that are neither a hit nor a miss.
    neighbour_pairs = (("pedestrian", "person_sitting"), ("car", "van"))

    target = class_names[current_class].lower()
    gt_count = len(gt_anno["name"])
    dt_count = len(dt_anno["name"])

    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    num_valid_gt = 0

    for g in range(gt_count):
        box = gt_anno["bbox"][g]
        label = gt_anno["name"][g].lower()
        box_height = box[3] - box[1]

        if label == target:
            relevance = 1
        elif (target, label) in neighbour_pairs:
            relevance = 0
        else:
            relevance = -1

        too_hard = bool(
            gt_anno["occluded"][g] > max_occlusion[difficulty]
            or gt_anno["truncated"][g] > max_truncation[difficulty]
            or box_height <= min_height[difficulty]
        )

        if relevance == -1:
            ignored_gt.append(-1)
        elif relevance == 1 and not too_hard:
            ignored_gt.append(0)
            num_valid_gt += 1
        else:
            ignored_gt.append(1)

        # Compared against the raw (not lower-cased) name.
        if gt_anno["name"][g] == "DontCare":
            dc_bboxes.append(gt_anno["bbox"][g])

    for d in range(dt_count):
        same_class = dt_anno["name"][d].lower() == target
        box_height = abs(dt_anno["bbox"][d, 3] - dt_anno["bbox"][d, 1])
        if box_height < min_height[difficulty]:
            ignored_dt.append(1)
        else:
            ignored_dt.append(0 if same_class else -1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
Example 51
Source File: common.py From xunfengES with GNU General Public License v3.0 | 5 votes |
def cleanPostData(data):
    """Collapse multi-line text into one comma-separated string.

    Every space character is removed, the text is split on newlines, empty
    lines are discarded and the remaining lines are joined with ",".

    :param data: the text to clean
    :return: the non-empty, space-free lines joined by commas
    """
    # Remove spaces, then split into lines.
    lines = data.replace(" ", "").split("\n")
    # One filtering pass instead of repeated list.remove("") calls, each of
    # which rescans the list.
    return ",".join(line for line in lines if line)
Example 52
Source File: fh_utils.py From IBATS_HuobiFeeder_old with GNU General Public License v3.0 | 5 votes |
def clean_datetime_remove_time_data(atime):
    """
    Zero out the hour, minute and second of a time object.

    :param atime: object exposing ``year``, ``month`` and ``day``
    :return: a new ``datetime`` at midnight of the same calendar day
    """
    return datetime(year=atime.year, month=atime.month, day=atime.day)
Example 53
Source File: analyzer.py From uwsgi-sloth with Apache License 2.0 | 5 votes |
def clean_data_by_key(self, key):
    """Delete ``key`` from ``self.data``; a missing key is silently ignored."""
    store = self.data
    try:
        del store[key]
    except KeyError:
        # Nothing stored under this key, so there is nothing to clean.
        return
Example 54
Source File: fMRI.py From mmvt with GNU General Public License v3.0 | 5 votes |
def clean_4d_data(args):
    '''
    Build a fresh argument set with ``fmri.read_cmd_args`` and hand it,
    together with ``fmri.main``, to ``pu.run_on_subjects``.

    Only ``args.subject`` and ``args.atlas`` are taken from the incoming
    args. The function name, fMRI file template and fsd are fixed here.

    Example invocation kept from the original docstring:
    python -m src.preproc.fMRI -s nmr00474,nmr00502,nmr00515,nmr00603,nmr00609,nmr00626,nmr00629,nmr00650,nmr00657,nmr00669,nmr00674,nmr00681,nmr00683,nmr00692,nmr00698,nmr00710 -a laus125 -f clean_resting_state_data --template_brain fsaverage5 --fmri_file_template "f.nii*" --remote_subject_dir "/space/franklin/1/users/sx424/mem_flex/subjects/{subject}"
    '''
    # template_brain='fsaverage5' was left disabled in the original.
    run_args = fmri.read_cmd_args({
        'subject': args.subject,
        'atlas': args.atlas,
        'function': 'clean_4d_data',
        'fmri_file_template': 'rest.nii*',
        'fsd': 'rest_linda',
    })
    pu.run_on_subjects(run_args, fmri.main)
Example 55
Source File: backend.py From neon with Apache License 2.0 | 5 votes |
def clean_data(self, tensor, layer_mkl):
    """
    Hook for MKL backends to clean MKL data (memory is not freed).

    This implementation ignores both arguments and returns None.
    """
    return