Python clean data

60 Python code examples are found related to "clean data". You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: ComplexityData.py    From py-ecomplexity with MIT License 7 votes vote down vote up
def clean_data(self, val_errors_flag_input):
        """Clean data to remove non-numeric values, handle NA's and duplicates"""
        # Make sure values are numeric
        self.data.val = pd.to_numeric(
            self.data.val, errors=val_errors_flag_input)
        self.data.set_index(['time', 'loc', 'prod'], inplace=True)
        if self.data.val.isnull().values.any():
            warnings.warn('NaN value(s) present, coercing to zero(es)')
            self.data.val.fillna(0, inplace=True)

        # Remove duplicates
        dups = self.data.index.duplicated()
        if dups.sum() > 0:
            warnings.warn(
                'Duplicate values exist, keeping the first occurrence')
            self.data = self.data[~self.data.index.duplicated()] 
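
As a quick illustration of the same pandas pattern (coerce to numeric, fill NaN with zero, drop duplicate index entries), here is a minimal self-contained sketch with made-up data, independent of the py-ecomplexity class above:

import pandas as pd

df = pd.DataFrame({
    'time': [2000, 2000, 2000],
    'loc':  ['A', 'A', 'B'],
    'prod': ['x', 'x', 'y'],
    'val':  ['10', 'bad', None],
})
df['val'] = pd.to_numeric(df['val'], errors='coerce')   # 'bad' and None become NaN
df = df.set_index(['time', 'loc', 'prod'])
df['val'] = df['val'].fillna(0)                          # NaN -> 0
df = df[~df.index.duplicated(keep='first')]              # keep only the first duplicate row
print(df)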
Example 2
Source File: zhihu_item.py    From FunpySpiderSearchEngine with Apache License 2.0 6 votes vote down vote up
def clean_data(self):
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except BaseException:
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except:
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"]) 
Example 3
Source File: lagou_companies.py    From webspider with MIT License 6 votes vote down vote up
def clean_lg_company_data(company_dict):
    """
    Clean the crawled company information.

    :param company_dict: tornado.util.ObjectDict
    """
    if 'size' in company_dict:
        company_dict.size = company_dict.size.strip()
    if 'finance_stage' in company_dict:
        company_dict.finance_stage = company_dict.finance_stage.strip()
    if 'features' in company_dict:
        company_dict.features = utils.text.to_plaintext(company_dict.features)
    if 'address' in company_dict:
        company_dict.address = utils.text.to_plaintext(company_dict.address)
    if 'introduce' in company_dict:
        company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else ''
        company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]
    if 'advantage' in company_dict:
        company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage))
        company_dict.advantage = json.dumps(company_dict.advantage)[
            :constants.COMPANY_ADVANTAGE_MAX_LEN]
    if 'industries' in company_dict:
        company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries)) 
Example 4
Source File: selectors.py    From invana-bot with MIT License 6 votes vote down vote up
def clean_data(elements=None, selector=None):
    """

    This is where the extracted data is cleaned up and where functions and data types are applied as needed.

    :param elements: extracted elements to clean
    :param selector: selector definition, including an optional "data_type" key
    :return: cleaned and type-converted data
    """
    data_type = selector.get("data_type", "RawField")

    if data_type.startswith("List"):
        multiple = True
    else:
        multiple = False

    data_extractor = SelectorExtractor()
    if multiple is True:
        extracted_data = data_extractor.get_list_data(elements=elements)
    else:
        extracted_data = data_extractor.get_single_data(elements=elements)
    data = transform_data(data=extracted_data, data_type=data_type)
    return data 
Example 5
Source File: jquery_scrolldepth.py    From carebot with MIT License 6 votes vote down vote up
def clean_data(self, data):
        """
        Fix data types, truncate the data, and otherwise make it fit for
        consumption.
        """
        rows = []
        for row in data:
            row[0] = int(row[0]) # Percent depth on page
            row[1] = int(row[1]) # Total users
            row[2] = int(row[2]) # Seconds on page
            rows.append(row)

        # Sort the row data from 10% => 100%
        rows.sort(key=lambda tup: tup[0])

        rows = self.fill_in_max(rows)

        # Only take the first 10 rows.
        truncated = rows[:10]
        return truncated 
Example 6
Source File: punctuator.py    From keras-punctuator with MIT License 6 votes vote down vote up
def cleanData(inputFile):
    sys.stderr.write("Cleaning data " + inputFile + "\n")
    mappings = OrderedDict([
        (re.compile("['’]"), "'"),
        # (re.compile("' s([" + DOT_LIKE_AND_SPACE + "])"), "'s\g<1>"), # Removes strange text mistake pattern in europarl data.
        (re.compile("n't"), " n't"),
        #(re.compile(" '([^" + DOT_LIKE + "']*)'"), '. \g<1>.'), # Remove quoting apostrophes.
        (re.compile("'([^t])"), " '\g<1>"), # Separate tokens like "'s" "'ll" and so on.
        #(re.compile('\([^)]*\)'), ''), # Removes bracketed.
        (re.compile('[-—]'), ' '), # Dash to space.
        (re.compile('[^a-z0-9A-Z\',\.?! ]'), ' '), # Other unknown to space.
        # (re.compile('^$|^\.$'), ''), # Removes empty line.
    ])
    cleanFile = inputFile + '.clean'
    regexProcess(mappings, inputFile, cleanFile)
    return cleanFile 
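
To see what ordered regex mappings of this kind do in practice, here is a small standalone sketch on a made-up sentence; the patterns are a subset of the ones in cleanData above and are applied in order with re.sub:

import re
from collections import OrderedDict

mappings = OrderedDict([
    (re.compile("['’]"), "'"),                    # normalise apostrophes
    (re.compile("n't"), " n't"),                  # split "don't" into "do n't"
    (re.compile('[-—]'), ' '),                    # dash to space
    (re.compile("[^a-z0-9A-Z',\\.?! ]"), ' '),    # any other unknown character to space
])

text = "I don’t know — it's £5!"
for pattern, replacement in mappings.items():
    text = pattern.sub(replacement, text)
print(text)   # apostrophes normalised, "n't" separated, dash and '£' turned into spaces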
Example 7
Source File: get_data.py    From Short-Text-Summarization with Apache License 2.0 6 votes vote down vote up
def get_clean_data(filepath):
	# use re to delete useless information in data
	list_summary , list_short_text = parser_txt_to_data(filepath)

	def _remove_special_char(m):
		s = m.group(0)
		if s in u',。!?;“”:《》':
			return s
		return ''

	for i,line in enumerate(list_summary):
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_summary[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')
	
	for i,line in enumerate(list_short_text):
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_short_text[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')
	
	#print len(list_summary),type(list_summary),len(list_short_text),type(list_short_text)
	return list_summary , list_short_text 
Example 8
Source File: kanji.py    From apex-sigma-core with GNU General Public License v3.0 6 votes vote down vote up
def clean_readings_data(kanji_dict):
    """

    :param kanji_dict:
    :type kanji_dict:
    :return:
    :rtype:
    """
    readings = kanji_dict['readings']
    bad_chars = ['、 ', '、', '\t', ' ']
    rds = {'readings': {'kun': [], 'on': [], 'names': []}}
    for r_type in readings:
        for item in readings[r_type]:
            if item not in bad_chars:
                for char in bad_chars:
                    if char in item:
                        item = item.replace(char, '')
                rds['readings'][r_type].append(item)
    return rds 
Example 9
Source File: train_and_predict.py    From titanic_machine_learning_example with MIT License 6 votes vote down vote up
def cleanData(data):
  # If fare data is missing, replace it with the average from that class
  data.Fare = data.Fare.map(lambda x: np.nan if x==0 else x)
  classmeans = data.pivot_table('Fare', rows='Pclass', aggfunc='mean')
  data.Fare = data[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )

  # Turn names into a number representing titles
  data.Name = data.Name.map(lambda x: parseName(x))

  # Convert sex into a numeric value
  data.Sex = data.Sex.apply(lambda sex: 0 if sex == "male" else 1)

  return data


# Load training and test data sets, cleaning them in the process 
Example 10
Source File: cli.py    From betterlifepsi with MIT License 6 votes vote down vote up
def clean_transaction_data():
    """
    Clean all the transaction data, and keep all master data
    """
    # TODO.xqliu Disable clean of database for production
    from psi.app.service import Info
    database = Info.get_db()
    database.engine.execute("""
        DELETE FROM related_values;
        DELETE FROM inventory_in_out_link;
        DELETE FROM incoming;
        DELETE FROM shipping_line;
        DELETE FROM shipping;
        DELETE FROM expense;
        DELETE FROM receiving_line;
        DELETE FROM receiving;
        DELETE FROM purchase_order_line;
        DELETE FROM purchase_order;
        DELETE FROM sales_order_line;
        DELETE FROM sales_order;
        DELETE FROM inventory_transaction_line;
        DELETE FROM inventory_transaction;
        commit;
    """) 
Example 11
Source File: execute.py    From olapy with GNU Lesser General Public License v2.1 6 votes vote down vote up
def clean_data(star_schema_df, measures):
        """measure like this: 1 349 is not numeric so we try to transform it to
        1349.

        :param star_schema_df: start schema dataframe
        :param measures: list of measures columns names

        :return: cleaned columns
        """
        if measures:
            for measure in measures:
                if star_schema_df[measure].dtype == object:
                    star_schema_df[measure] = star_schema_df[measure].str.replace(
                        " ", ""
                    )
                    try:
                        star_schema_df[measure] = star_schema_df[measure].astype(
                            "float"
                        )
                    except ValueError:
                        star_schema_df = star_schema_df.drop(measure, 1)
        return star_schema_df 
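
The idea above — strings such as "1 349" become 1349.0, and columns that still fail numeric conversion are dropped — can be sketched standalone on toy data (this is not the olapy API, just the same pattern):

import pandas as pd

df = pd.DataFrame({'sales': ['1 349', '2 500', '17'], 'note': ['n/a', 'x', 'y']})
for measure in ['sales', 'note']:
    if df[measure].dtype == object:
        df[measure] = df[measure].str.replace(' ', '')
        try:
            df[measure] = df[measure].astype('float')   # 'sales' becomes numeric
        except ValueError:
            df = df.drop(columns=measure)               # 'note' cannot be converted, drop it
print(df.dtypes)   # only the numeric 'sales' column remains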
Example 12
Source File: lsi_model.py    From aca with MIT License 6 votes vote down vote up
def clean_data( text):

    n_text = []
    text = text.strip()
    p_set = '. , ! : ? ` ` '.split()
    for i in range(len(text)):
        if text[i] not in p_set:
            n_text.append(text[i])
    text = ''.join(n_text)

    stop_list =set('a is are on from for and not to'.split())
    #stop_list =set('a is are on from for and not to that this there these \
    #               those have has been were I you me they can could be do . , : ! ? '.split())
    text = [word for word in text.lower().split() if word not in stop_list]
    #text = [stemmer.stem(t) for t in text]
    return text 
Example 13
Source File: LegendaryFish.py    From Pirates-Online-Rewritten with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def cleanFishData(self):
        self.staminaValue = self.myData['stamina']
        self.fishStaminaBar['value'] = self.staminaValue
        self.fishStaminaValueLabel.setText(str(int(self.staminaValue)) + ' / ' + str(100))
        self.fishStaminaBar['barColor'] = FishingGlobals.fishingStaminaBarColor[int(self.staminaValue / 100.0 * (len(FishingGlobals.fishingStaminaBarColor) - 1))]
        self.hideStaminaBar()
        taskMgr.remove('updateFishStaminaTask')
        self.lurePosition = None
        self.fishChaseLureSequence.pause()
        self.fishChaseLureSequence.clearToInitial()
        self.lfStruggleSequence.pause()
        self.lfStruggleSequence.clearToInitial()
        if self.aboutToBitingInterval is None:
            return
        self.aboutToBitingInterval.pause()
        return 
Example 14
Source File: data_cleaning.py    From DataProcessing_Python with MIT License 6 votes vote down vote up
def clean_data(self):
        """Performs standard data cleaning functions
        """

        self.extract_ids()
        self.extract_target()

        self.check_column_names()
        
        self.remove_constant_variables()
        self.convert_columns_to_binary()

        self.check_date_variables()
        self.check_categorical_variables()
        self.encode_categories()

        print("Data is clean and ready!\n")


    ## function for removing columns 
Example 15
Source File: cqhttp_helper.py    From python-cqhttp with MIT License 6 votes vote down vote up
def clean_data_dir(self, *, data_dir):
        """
        Clean a data directory.

        ------------

        :param str data_dir: name of the directory to clean; supports `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean a data directory that has accumulated too many old files, such as `image`.

        Added in HTTP API v3.3.4
        """
        return super().__getattr__('clean_data_dir') \
            (data_dir=data_dir) 
Example 16
Source File: cqhttp_helper.py    From python-cqhttp with MIT License 6 votes vote down vote up
def clean_data_dir_async(self, *, data_dir):
        """
        Clean a data directory (asynchronous version).

        ------------

        :param str data_dir: name of the directory to clean; supports `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean a data directory that has accumulated too many old files, such as `image`.

        Added in HTTP API v3.3.4
        """
        return super().__getattr__('clean_data_dir_async') \
            (data_dir=data_dir) 
Example 17
Source File: event_study.py    From sanpy with MIT License 6 votes vote down vote up
def clean_data(data, events, starting_point):
    """
    Cleans out signals that do not have enough pricing data
    """
    events_df = events.copy(deep=True)
    events_df['in_pricesdf'] = 0
    id = 0

    for date, row in events_df.iterrows():
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            events_df.iloc[id, -1] = 1
            id = id+1
            continue
        event_day = data.index.searchsorted(date)
        hist_index_start = event_day - starting_point
        hist_index_end = event_day + starting_point
        event_window = data.iloc[hist_index_start:hist_index_end][[sid]]
        if event_window.min()[0] == 0 or len(event_window) == 0:
            events_df.iloc[id, -1] = 1
        id = id+1
    return events_df[events_df['in_pricesdf'] == 0] 
Example 18
Source File: auxiliary_dataset.py    From ZeroShotVideoClassification with Apache License 2.0 6 votes vote down vote up
def clean_data(fnames, labels):
        if not isinstance(fnames[0], str):
            print('Cannot check for broken videos')
            return fnames, labels
        broken_videos_file = 'assets/kinetics_broken_videos.txt'
        if not os.path.exists(broken_videos_file):
            print('Broken video list does not exist')
            return fnames, labels

        t = time()
        with open(broken_videos_file, 'r') as f:
            broken_samples = [r[:-1] for r in f.readlines()]
        data = [x[75:] for x in fnames]
        keep_sample = np.in1d(data, broken_samples) == False
        fnames = np.array(fnames)[keep_sample]
        labels = np.array(labels)[keep_sample]
        print('Broken videos %.2f%% - removing took %.2f' % (100 * (1.0 - keep_sample.mean()), time() - t))
        return fnames, labels 
Example 19
Source File: model.py    From libhxl-python with The Unlicense 6 votes vote down vote up
def clean_data(
            self, whitespace=[], upper=[], lower=[], date=[], date_format=None,
            number=[], number_format=None, latlon=[], purge=False, queries=[]
    ):
        """Clean data fields."""
        import hxl.filters
        return hxl.filters.CleanDataFilter(
            self,
            whitespace=whitespace,
            upper=upper,
            lower=lower,
            date=date, date_format=date_format,
            number=number, number_format=number_format,
            latlon=latlon,
            purge=purge,
            queries=queries
        ) 
Example 20
Source File: cron.py    From oh-my-rss with MIT License 6 votes vote down vote up
def clean_history_data():
    """
    Clean up historical data
    :return:
    """
    logger.info('Start cleaning historical data')

    lastweek = datetime.now() - timedelta(days=7)
    last3month = datetime.now() - timedelta(days=90)
    lastyear = datetime.now() - timedelta(days=365)

    # star < 10: delete directly
    Article.objects.filter(site__star__lt=10, ctime__lte=lastweek).delete()

    # star in [10, 20): created more than 3 months ago, clear the content
    Article.objects.filter(site__star__gte=10, site__star__lt=20, ctime__lte=last3month).update(content=' ')

    # star >= 20: created more than a year ago, clear the content
    Article.objects.filter(site__star__gte=20, ctime__lte=lastyear).update(content=' ')

    # compact the database
    vacuum_sqlite_db()

    logger.info('Historical data cleanup complete')
Example 21
Source File: sdk.py    From darwin-sdk with Apache License 2.0 6 votes vote down vote up
def clean_data(self, dataset_name, **kwargs):
        url = self.server_url + self.routes['clean_data'] + urllib.parse.quote(dataset_name, safe='')
        headers = self.get_auth_header()
        parameters = kwargs
        if headers is None:
            return False, "Cannot get Auth token. Please log in."
        r = self.s.post(url, headers=headers, json=parameters)
        if not r.ok and 'Please run analyze data' in r.text:
            print("Raw profile not found. Running analyze_data")
            char_encoding = parameters['char_encoding'] if 'char_encoding' in parameters else 'utf-8'
            r = self.analyze_data(dataset_name, char_encoding=char_encoding)
            if r[0]:
                r = self.s.post(url, headers=headers, json=parameters)
            else:
                return r
        return self.get_return_info(r)

    # Create risk information for a dataset
Example 22
Source File: atlas3.py    From ssbio with MIT License 6 votes vote down vote up
def clean_data(self, keep_features=None, remove_correlated_feats=True):
        self.features_df = self.features_df.astype(float).fillna(0)
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                                   np.absolute(perc_spearman.correlation))
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]
                indices_to_keep.append(centroid)

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)] 
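
The correlated-feature removal above clusters features on a Spearman-based distance (1 - |rho|) and keeps one representative per cluster. A compact, self-contained sketch of that grouping step on synthetic data (this is not the ssbio API; the 0.1 threshold mirrors the code above):

import numpy as np
import pandas as pd
import scipy.stats
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
base = rng.normal(size=50)
tmp = pd.DataFrame({'f1': base,
                    'f2': base + rng.normal(scale=0.01, size=50),   # nearly identical to f1
                    'f3': rng.normal(size=50)})                     # independent feature

corr = scipy.stats.spearmanr(tmp).correlation
dist = 1.0 - np.abs(corr)                    # highly correlated -> distance near 0
np.fill_diagonal(dist, 0)
dist = np.maximum(dist, dist.T)              # enforce exact symmetry for squareform
clusters = fcluster(linkage(squareform(dist), method='average'), 0.1, criterion='distance')
print(dict(zip(tmp.columns, clusters)))      # f1 and f2 share a cluster, f3 gets its own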
Example 23
Source File: cluster.py    From yass with Apache License 2.0 6 votes vote down vote up
def clean_input_data(self):
        # limit clustering to at most 50,000 spikes
        max_spikes = self.CONFIG.cluster.max_n_spikes
        if len(self.spike_times_original)>max_spikes:
            idx_sampled = np.random.choice(
                a=np.arange(len(self.spike_times_original)),
                size=max_spikes,
                replace=False)
            self.spike_times_original = self.spike_times_original[idx_sampled]
        else:
            idx_sampled = np.arange(len(self.spike_times_original))

        # limit indexes away from edge of recording
        idx_inbounds = np.where(np.logical_and(
                        self.spike_times_original>=self.spike_size//2,
                        self.spike_times_original<(self.reader_raw.rec_len-self.spike_size)))[0]
        self.spike_times_original = self.spike_times_original[
            idx_inbounds].astype('int32')

        # clean upsampled ids if available
        if not self.raw_data:
            self.template_ids_in = self.template_ids_in[
                idx_sampled][idx_inbounds].astype('int32') 
Example 24
Source File: marriage.py    From DataExploration with MIT License 6 votes vote down vote up
def CleanData(resp):
    """Cleans a respondent DataFrame.

    resp: DataFrame of respondents

    Adds columns: agemarry, age, decade, fives
    """
    resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)

    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) 
             for cm in resp.cmbirth]
    resp['year'] = (pd.DatetimeIndex(dates).year - 1900)
    resp['decade'] = resp.year // 10
    resp['fives'] = resp.year // 5 
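
The cmbirth / cmmarrhx fields above are century-month codes (months counted from December 1899, as the month0 anchor suggests); a tiny standalone sketch of that conversion with made-up codes:

import pandas as pd

cmbirth = pd.Series([1158, 1170])                 # hypothetical century-month codes
month0 = pd.to_datetime('1899-12-15')
dates = [month0 + pd.DateOffset(months=cm) for cm in cmbirth]
year = pd.DatetimeIndex(dates).year - 1900        # -> 96 and 97 (i.e. 1996, 1997)
print(year // 10, year // 5)                      # decade and five-year bins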
Example 25
Source File: clean.py    From santander-product-recommendation-8th-place with MIT License 6 votes vote down vote up
def clean_data(fi, fo, header, suffix):
    head = fi.readline().strip("\n").split(",")
    head = [h.strip('"') for h in head]
    for i, h in enumerate(head):
        if h == "nomprov":
            ip = i
    print(ip)
    n = len(head)
    if header:
        fo.write("%s\n" % ",".join(head))

    print(n)
    for line in fi:
        fields = line.strip("\n").split(",")
        if len(fields) > n:
            prov = fields[ip] + fields[ip+1]
            del fields[ip]
            fields[ip] = prov
        assert len(fields) == n
        fields = [field.strip() for field in fields]
        fo.write("%s%s\n" % (",".join(fields), suffix)) 
Example 26
Source File: filter.py    From Phen2Gene with MIT License 6 votes vote down vote up
def clean_term_data(HPid,xref,is_a,name,definition,is_obsolete,replaced_by,consider,alt_id,synonym,created_by, creation_date,comment, subset,property_value):
    HPid = ""
    xref = []
    synonym = []
    is_a = []
    name = ""

    definition = ""
    is_obsolete = False
    replaced_by = []
    consider = []
    alt_id = []

    created_by = ""
    creation_date = ""
    comment = ""
    subset = ""
    property_value = ""

    return (HPid,xref,is_a,name,definition,is_obsolete,replaced_by,consider,alt_id,synonym,created_by, creation_date,comment,subset,property_value) 
Example 27
Source File: make_mock_json.py    From osbs-client with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def clean_data(self, out_data):
        if isinstance(out_data, dict):
            cleaned_data = {}
            for key, data in out_data.items():
                cleaned_data[key] = self.clean_data(data)
            return cleaned_data
        elif isinstance(out_data, list):
            cleaned_data = []
            for data in out_data:
                cleaned_data.append(self.clean_data(data))
            return cleaned_data
        elif isinstance(out_data, str):
            if re.search(self.rh_pattern, out_data):
                return re.sub(self.ex_pattern, "example.com", out_data)
            else:
                return out_data
        else:
            return out_data 
Example 28
Source File: movie.py    From BoxOfficeMojo with MIT License 6 votes vote down vote up
def clean_data(self):
        """Formats all the extracted data into the appropriate types"""

        for results in self.data["Weekly"]:
            utils.convert_financial_field(results, "Average Per Theatre")
            utils.convert_financial_field(results, "Gross")
            utils.convert_financial_field(results, "Gross To Date")
            utils.convert_percent_field(results, "Week Over Week Change")
            utils.convert_date_field(results, "Week")
            utils.convert_int_field(results, "Rank")
            utils.convert_int_field(results, "Theaters")
            utils.convert_int_field(results, "Theatre Change")
            utils.convert_int_field(results, "Week Number")

        for key, value in self.data.iteritems():
            if "Total Gross" in key or "." in key:
                self.data.pop(key)
                break
        utils.standardize_keys(self.data) 
Example 29
Source File: data_cleanup.py    From DeepLearning-IDS with MIT License 6 votes vote down vote up
def cleanAllData():
    # inputDataPath = os.path.join(
    #    os.path.dirname(os.path.realpath(__file__)), )
    inputDataPath = '../ProcessedTrafficData'
    outputDataPath = '../NewCleanedData'
    if (not os.path.exists(outputDataPath)):
        os.mkdir(outputDataPath)

    files = os.listdir(inputDataPath)
    for file in files:
        if file.startswith('.'):
            continue
        if os.path.isdir(file):
            continue
        outFile = os.path.join(outputDataPath, file)
        inputFile = os.path.join(inputDataPath, file)
        cleanData(inputFile, outFile) 
Example 30
Source File: report_db_accessor_base.py    From koku with GNU Affero General Public License v3.0 6 votes vote down vote up
def clean_data(self, data, table_name):
        """Clean data for insertion into database.

        Args:
            data (dict): The data to be cleaned
            table_name (str): The table name the data is associated with

        Returns:
            (dict): The data with values converted to required types

        """
        column_types = self.report_schema.column_types[table_name]

        for key, value in data.items():
            if value is None or value == "":
                data[key] = None
                continue
            if column_types.get(key) == int or column_types.get(key) == "BigIntegerField":
                data[key] = self._convert_value(value, int)
            elif column_types.get(key) == float:
                data[key] = self._convert_value(value, float)
            elif column_types.get(key) == Decimal:
                data[key] = self._convert_value(value, Decimal)

        return data 
Example 31
Source File: run_lfads.py    From object_detection_with_tensorflow with MIT License 5 votes vote down vote up
def clean_data_dict(data_dict):
  """Add some key/value pairs to the data dict, if they are missing.
  Args:
    data_dict - dictionary containing data for LFADS
  Returns:
    data_dict with some keys filled in, if they are absent.
  """

  keys = ['train_truth', 'train_ext_input', 'valid_data',
          'valid_truth', 'valid_ext_input', 'valid_train']
  for k in keys:
    if k not in data_dict:
      data_dict[k] = None

  return data_dict 
Example 32
Source File: eval.py    From Det3D with Apache License 2.0 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_cls_name, difficulty=None):
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    num_gt = len(gt_anno["name"])
    num_dt = len(dt_anno["name"])
    num_valid_gt = 0
    for i in range(num_gt):
        gt_name = gt_anno["name"][i].lower()
        valid_class = -1
        if gt_name == current_cls_name:
            valid_class = 1
        else:
            valid_class = -1
        ignore = False
        if valid_class == 1 and not ignore:
            ignored_gt.append(0)
            num_valid_gt += 1
        else:
            ignored_gt.append(-1)
    for i in range(num_dt):
        if dt_anno["name"][i] == current_cls_name:
            valid_class = 1
        else:
            valid_class = -1
        if valid_class == 1:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes 
Example 33
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0 5 votes vote down vote up
def clean_data(self, df, target_var, task_type, name):
        """Cleans a dataset by removing outliers
        Outiers and missing values are replaced by
        median for continuous and mode for categorical

        Arguments:
                df : dask dataframe, The dataframe to be cleaned
                target_var : string, Name of the target variable
                task_type : string, Type of the task at hand
                name : string, Name of the data being cleaned (train or eval)

        Returns:
                df : dask dataframe, Cleaned dataframe
                mean : dask series, mean of each column
                std_dev : dask series, standard deviation of each column
                _csv_defaults : list, list of default value of each column

        """
        mean, median, mode, std_dev = self.calculate_stats(df, target_var)
        df = self.dropping_zero_var_cols(df, target_var, std_dev)
        df = self.impute(df, target_var, median, mode)
        if task_type == 'classification':
            if df[target_var].dtype == 'float64':
                df[target_var] = df[target_var].astype(np.int64)
        dtype_map = {'float64': 0., 'int64': 0, 'object': ''}
        dtype_list = [str(dtype) for dtype in df.dtypes]
        _csv_defaults = [[dtype_map[dtype]] for dtype in dtype_list]
        if name == 'train' and task_type == 'classification':
            self.creating_explainer_lime(df, target_var)
        df.to_csv('/tmp/clean_*_' + str(name) + '.csv', index=False)
        return df, mean, std_dev, _csv_defaults 
Example 34
Source File: telescopes.py    From pyLIMA with GNU General Public License v3.0 5 votes vote down vote up
def clean_data_magnitude(self):
        """
        Clean the telescope's outliers for the fits. Points are considered outliers if they
        are 10 mag brighter or fainter than the lightcurve median, if NaN appears in any
        column, or if the error bar is larger than 1 mag.

        :return: the cleaned magnitude lightcurve
        :rtype: array_like
        """

        maximum_accepted_precision = 1.0

        index = np.where((~np.isnan(self.lightcurve_magnitude).any(axis=1)) &
                         (np.abs(self.lightcurve_magnitude[:, 2]) <= maximum_accepted_precision))[0]

        lightcurve = self.lightcurve_magnitude[index]

        index = np.where((np.isnan(self.lightcurve_magnitude).any(axis=1)) |
                         (np.abs(self.lightcurve_magnitude[:, 2]) > maximum_accepted_precision))[0]
        if len(index) != 0:
            self.bad_points_magnitude = index
            print('pyLIMA found some bad points in the telescope ' + self.name + ', you can find these in the ' \
                                                                                 'bad_points_magnitude attribute.')

        return lightcurve 
Example 35
Source File: telescopes.py    From pyLIMA with GNU General Public License v3.0 5 votes vote down vote up
def clean_data_flux(self):
        """
        Clean the telescope's outliers for the fits. Points are considered outliers if they
        are 10 mag brighter or fainter than the lightcurve median, if NaN appears in any
        column, or if the error bar is larger than 1 mag.

        :return: the cleaned flux lightcurve
        :rtype: array_like
        """

        maximum_accepted_precision = 1.0
        flux = self.lightcurve_flux[:, 1]
        error_flux = self.lightcurve_flux[:, 2]
        index = np.where(
            (~np.isnan(self.lightcurve_flux).any(axis=1)) & (np.abs(error_flux / flux) <= maximum_accepted_precision) & (flux>0))[
            0]

        lightcurve = self.lightcurve_flux[index]

        index = np.where(
            (np.isnan(self.lightcurve_flux).any(axis=1)) | (np.abs(error_flux / flux) > maximum_accepted_precision) | (flux<=0))[0]
        if len(index) != 0:
            self.bad_points_flux = index
            print('pyLIMA found some bad points in the telescope ' + self.name + ', you can find these in the ' \
                                                                                 'bad_points_flux attribute.')
        return lightcurve 
Example 36
Source File: readers.py    From PVGeo with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def clean_data_name(data_name, filename):
    """A helper to clean a filename to make a useful data array name"""
    if data_name is None or data_name == '':
        data_name = os.path.splitext(os.path.basename(filename))[0]
    return data_name 
Example 37
Source File: taxi.py    From code-snippets with Apache License 2.0 5 votes vote down vote up
def clean_raw_data_dict(input_dict, raw_feature_spec):
  """Clean raw data dict."""
  output_dict = {}

  for key in raw_feature_spec:
    if key not in input_dict or not input_dict[key]:
      output_dict[key] = []
    else:
      output_dict[key] = [input_dict[key]]
  return output_dict 
Example 38
Source File: filterUtils.py    From director with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def cleanPolyData(polyData):
    clean = vtk.vtkCleanPolyData()
    clean.SetInputData(polyData)
    clean.Update()
    return shallowCopy(clean.GetOutput()) 
Example 39
Source File: evaluation_runner.py    From moviegeek with MIT License 5 votes vote down vote up
def clean_data(self, ratings, min_ratings=5):
        self.logger.debug("cleaning data only to contain users with at least {} ratings".format(min_ratings))

        original_size = ratings.shape[0]

        user_count = ratings[['user_id', 'movie_id']]
        user_count = user_count.groupby('user_id').count()
        user_count = user_count.reset_index()
        user_ids = user_count[user_count['movie_id'] > min_ratings]['user_id']

        ratings = ratings[ratings['user_id'].isin(user_ids)]
        new_size = ratings.shape[0]
        self.logger.debug('reduced dataset from {} to {}'.format(original_size, new_size))
        return ratings 
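
A quick way to check that filter on a toy ratings frame (hypothetical data, not tied to the moviegeek project), keeping only users with more than five ratings:

import pandas as pd

ratings = pd.DataFrame({
    'user_id':  [1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3],
    'movie_id': [10, 11, 12, 10, 11, 10, 11, 12, 13, 14, 15],
    'rating':   [4, 5, 3, 2, 4, 5, 4, 3, 5, 4, 2],
})
user_count = ratings.groupby('user_id')['movie_id'].count().reset_index()
user_ids = user_count[user_count['movie_id'] > 5]['user_id']   # only user 3 qualifies
print(ratings[ratings['user_id'].isin(user_ids)])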
Example 40
Source File: holoclean.py    From HoloClean-Legacy-deprecated with Apache License 2.0 5 votes vote down vote up
def load_clean_data(self, file_path):
        """
        Loads pre-defined clean cells from csv

        :param file_path: path to file

        :return: spark dataframe of clean cells
        """
        clean = self.holo_env.spark_session.read.csv(file_path, header=True)
        self.holo_env.dataengine.add_db_table('C_clean', clean, self.dataset)

        return clean 
Example 41
Source File: mturk_depth_api_lsp.py    From rel_3d_pose with MIT License 5 votes vote down vote up
def cleanAssignmentData( ass_data ):
    _polished_data                  = {}
    _polished_data['worker_id']     = ass_data['_worker_id']
    _polished_data['worker_exp']    = ass_data['_worker_exp']
    _polished_data['assignment_id'] = ass_data['_assignment_id']
    _polished_data['hit_id']        = ass_data['_hit_id']
    _polished_data['response_time'] = ass_data['_hit_rt']
    _polished_data['hit_comment']   = ass_data['_hit_comment']
    _polished_data['hit_it']        = ass_data['_hit_it']
    _polished_data['gui_rating']    = ass_data['_gui_rating']

    _ass_lsp_subj_ids = _mongo_coll_3.find_one({'_amt_hit_id':_polished_data['hit_id']})['_lsp_subjs_ids']
    _polished_data['lsp_subj_ids'] = _ass_lsp_subj_ids

    _trials_results_dict = json.loads( ass_data['_trials_results'] )

    _ass_trials = []

    _error = False
    for key in _trials_results_dict.keys():
        _trial = _trials_results_dict[key]
        _depth = json.loads(_trial['_depth_str'])

        _trial_info = \
            { "depth": _depth,
              'response_time': _trial['_trial_rt'],
              'lsp_subj_id': _trial['_lsp_subj_id'] }
        res_coll_1 = _mongo_coll_1.find_one({ '_lsp_subj_id':_trial['_lsp_subj_id'] })
        _trial_info['img_id'] = res_coll_1['_lsp_img_id']
        _ass_trials.append(_trial_info)

    _polished_data['trials'] = _ass_trials

    return (_polished_data, _error, ass_data['_hit_reject_flag'], ass_data['_hit_flag']) 
Example 42
Source File: visualize_preprocessing.py    From kaggle-heart with MIT License 5 votes vote down vote up
def clean_image_data(imdata, metadata):
    """
    clean up the 4D tensor of imdata consistently (fix contrast, orient images the right way up, etc.)
    :param imdata:
    :return:
    """

    # normalize contrast
    flat_data = np.concatenate([i.flatten() for i in imdata]).flatten()
    high = np.percentile(flat_data, 95.0)
    low  = np.percentile(flat_data, 5.0)
    print high,low
    for i in xrange(len(imdata)):
        image = imdata[i]
        image = 1.0 * (image - low) / (high - low)
        image = np.clip(image, 0.0, 1.0)
        imdata[i] = image

    return imdata 
Example 43
Source File: gas.py    From BNAF with MIT License 5 votes vote down vote up
def load_data_and_clean(file):

    data = load_data(file)
    B = get_correlation_numbers(data)

    while np.any(B > 1):
        col_to_remove = np.where(B > 1)[0][0]
        col_name = data.columns[col_to_remove]
        data.drop(col_name, axis=1, inplace=True)
        B = get_correlation_numbers(data)
    # print(data.corr())
    data = (data - data.mean()) / data.std()

    return data 
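
get_correlation_numbers is not shown in this snippet; presumably it counts, for each column, how many columns (including itself) it is strongly correlated with, so the loop drops columns until no pair remains highly correlated. A hedged sketch of that assumption, followed by the same drop-until-decorrelated loop and standardisation:

import numpy as np
import pandas as pd

def get_correlation_numbers(data, threshold=0.98):
    # hypothetical helper: for each column, count columns with |corr| above the threshold
    corr = data.corr().abs()
    return (corr > threshold).sum().values

rng = np.random.default_rng(1)
x = rng.normal(size=200)
data = pd.DataFrame({'a': x,
                     'b': x * 2 + 1e-6 * rng.normal(size=200),   # nearly collinear with 'a'
                     'c': rng.normal(size=200)})

B = get_correlation_numbers(data)
while np.any(B > 1):
    col_name = data.columns[np.where(B > 1)[0][0]]
    data = data.drop(col_name, axis=1)
    B = get_correlation_numbers(data)

data = (data - data.mean()) / data.std()    # standardise the remaining columns
print(data.columns.tolist())                # 'a' was dropped, 'b' and 'c' remain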
Example 44
Source File: gas.py    From BNAF with MIT License 5 votes vote down vote up
def load_data_and_clean_and_split(file):

    data = load_data_and_clean(file).as_matrix()
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data_train = data[0:-N_test]
    N_validate = int(0.1 * data_train.shape[0])
    data_validate = data_train[-N_validate:]
    data_train = data_train[0:-N_validate]

    return data_train, data_validate, data_test 
Example 45
Source File: utils.py    From pycrop-yield-prediction with MIT License 5 votes vote down vote up
def load_clean_yield_data(yield_data_filepath):
    """
    Cleans the yield data by making sure any NaN values in the columns we care about
    are removed
    """
    important_columns = ['Year', 'State ANSI', 'County ANSI', 'Value']
    yield_data = pd.read_csv(yield_data_filepath).dropna(subset=important_columns,
                                                         how='any')
    return yield_data 
Example 46
Source File: node.py    From Auto-PyTorch with Apache License 2.0 5 votes vote down vote up
def clean_fit_data(self):
        node = self
        
        # clear outputs
        while (node is not None):
            node.fit_output = None
            node.predict_output = None
            node = node.child_node 
Example 47
Source File: load_and_format_data.py    From CDSS with GNU General Public License v3.0 5 votes vote down vote up
def clean_raw_data(raw_data_dir, data_out_dir='/home/ec2-user/cs230/scripts/DeepLearning/RNN/data_final/',
                   out_prefix='rnn_data_', out_suffix='.txt', file_prefix='final_data_dropcols_', file_suffix='.txt',
                   nfiles=1):
    '''
    Cleans raw data from healthrex database pull

    Parameters:
        @raw_data_dir: (string) path to directory containing raw data files
        @data_out_dir: (string) path to directory for writing cleaned data
        @file_prefix: (string) prefix of raw data files
        @file_suffix: (string) suffix of raw data files
        @nfiles: (int) how many files to load
    '''
    print("BEGIN CLEAN DATA...")

    for n in range(10, nfiles):
        fpath = raw_data_dir + file_prefix + str(n) + file_suffix
        print("Cleaning {}...".format(fpath))
        opath = data_out_dir + out_prefix + str(n) + out_suffix
        df_total = pd.read_table(fpath, sep='\t', dtype=None, header=2)
        df_total = replace_none_nan(df_total)
        df_total.to_csv(opath, sep='\t', header=True, index=False)

    print("CLEAN DATA COMPLETE")

    return 
Example 48
Source File: DewpointChart.py    From mvp with MIT License 5 votes vote down vote up
def cleanData(data, test=False):
    '''Flatten structure to three columns'''
    out=[]
    for row in data:
#        print row
        hold={}
        # bin the timestamp into 20 minute groups
        # get only the first 19 characters of the timestamp
        d=UTCStrToLDT(row["start_date"]["timestamp"])
        d=d.replace(second=0, minute=int(math.floor(d.minute/20)))
        hold['timestamp']=str(d)
        hold["name"]=row["subject"]["attribute"]["name"]
        hold["value"]=row["subject"]["attribute"]["value"]
        out.append(hold)
    return out 
Example 49
Source File: demand_measures.py    From EnergyPATHWAYS with MIT License 5 votes vote down vote up
def clean_data(self):
        if self.input_type == 'total':
            self.savings = self.clean_timeseries('values', inplace=False, time_index_name='year', time_index=self.years)
        else:
            self.remap(map_from='raw_values', map_to='values', converted_geography=GeoMapper.demand_primary_geography, time_index_name='year',lower=-100) 
Example 50
Source File: eval.py    From nutonomy_pointpillars with MIT License 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    CLASS_NAMES = ['car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car', 'tractor', 'trailer']
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    current_cls_name = CLASS_NAMES[current_class].lower()
    num_gt = len(gt_anno["name"])
    num_dt = len(dt_anno["name"])
    num_valid_gt = 0
    for i in range(num_gt):
        bbox = gt_anno["bbox"][i]
        gt_name = gt_anno["name"][i].lower()
        height = bbox[3] - bbox[1]
        valid_class = -1
        if (gt_name == current_cls_name):
            valid_class = 1
        elif (current_cls_name == "Pedestrian".lower()
              and "Person_sitting".lower() == gt_name):
            valid_class = 0
        elif (current_cls_name == "Car".lower() and "Van".lower() == gt_name):
            valid_class = 0
        else:
            valid_class = -1
        ignore = False
        if ((gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty])
                or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty])
                or (height <= MIN_HEIGHT[difficulty])):
            # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1:
            ignore = True
        if valid_class == 1 and not ignore:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif (valid_class == 0 or (ignore and (valid_class == 1))):
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)
    # for i in range(num_gt):
        if gt_anno["name"][i] == "DontCare":
            dc_bboxes.append(gt_anno["bbox"][i])
    for i in range(num_dt):
        if (dt_anno["name"][i].lower() == current_cls_name):
            valid_class = 1
        else:
            valid_class = -1
        height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1])
        if height < MIN_HEIGHT[difficulty]:
            ignored_dt.append(1)
        elif valid_class == 1:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes 
Example 51
Source File: common.py    From xunfengES with GNU General Public License v3.0 5 votes vote down vote up
def cleanPostData(data):
    data = data.replace(" ","") # 去掉空格
    data = data.split("\n") # 去掉换行
    while "" in data:
        data.remove("")
    return ",".join(data) 
Example 52
Source File: fh_utils.py    From IBATS_HuobiFeeder_old with GNU General Public License v3.0 5 votes vote down vote up
def clean_datetime_remove_time_data(atime):
    """
    Zero out the hour, minute, and second of the given datetime object
    :param atime:
    :return:
    """
    return datetime(atime.year, atime.month, atime.day) 
Example 53
Source File: analyzer.py    From uwsgi-sloth with Apache License 2.0 5 votes vote down vote up
def clean_data_by_key(self, key):
        try:
            del self.data[key]
        except KeyError:
            pass 
Example 54
Source File: fMRI.py    From mmvt with GNU General Public License v3.0 5 votes vote down vote up
def clean_4d_data(args):
    '''
    python -m src.preproc.fMRI -s nmr00474,nmr00502,nmr00515,nmr00603,nmr00609,nmr00626,nmr00629,nmr00650,nmr00657,nmr00669,nmr00674,nmr00681,nmr00683,nmr00692,nmr00698,nmr00710
        -a laus125 -f clean_resting_state_data --template_brain fsaverage5 --fmri_file_template "f.nii*" --remote_subject_dir "/space/franklin/1/users/sx424/mem_flex/subjects/{subject}"'
    '''
    args = fmri.read_cmd_args(dict(
        subject=args.subject,
        atlas=args.atlas,
        function='clean_4d_data',
        fmri_file_template='rest.nii*',
        fsd='rest_linda'
        # template_brain='fsaverage5',
    ))
    pu.run_on_subjects(args, fmri.main) 
Example 55
Source File: backend.py    From neon with Apache License 2.0 5 votes vote down vote up
def clean_data(self, tensor, layer_mkl):
        """
        For MKL backends to clean mkl data (memory not freed)
        """
        return None