Python Code Examples for clean data

The following 60 Python code examples relate to "clean data". They are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Project: py-ecomplexity   Author: cid-harvard   File: ComplexityData.py    License: MIT License 7 votes vote down vote up
def clean_data(self, val_errors_flag_input):
        """Clean data to remove non-numeric values, handle NA's and duplicates.

        :param val_errors_flag_input: forwarded to ``pd.to_numeric`` as its
            ``errors`` argument ('raise', 'coerce' or 'ignore')
        """
        # Make sure values are numeric
        self.data.val = pd.to_numeric(
            self.data.val, errors=val_errors_flag_input)
        # Index on the (time, location, product) triple; duplicate detection
        # below is based on this index.
        self.data.set_index(['time', 'loc', 'prod'], inplace=True)
        if self.data.val.isnull().values.any():
            warnings.warn('NaN value(s) present, coercing to zero(es)')
            self.data.val.fillna(0, inplace=True)

        # Remove duplicates
        # duplicated() marks all but the first occurrence, matching the
        # warning text below.
        dups = self.data.index.duplicated()
        if dups.sum() > 0:
            warnings.warn(
                'Duplicate values exist, keeping the first occurrence')
            self.data = self.data[~self.data.index.duplicated()]
Example 2
Project: ssbio   Author: SBRG   File: atlas3.py    License: MIT License 6 votes vote down vote up
def clean_data(self, keep_features=None, remove_correlated_feats=True):
        """Clean ``self.features_df`` in place.

        Casts all features to float (NaN -> 0), drops rows that are zero for
        every sample, optionally restricts to *keep_features*, and optionally
        collapses groups of highly correlated features down to one
        representative feature per group.

        :param keep_features: optional iterable of feature index labels to keep
        :param remove_correlated_feats: if True, cluster features on Spearman
            correlation distance and keep a single feature per cluster
        """
        self.features_df = self.features_df.astype(float).fillna(0)
        # Drop features that are zero across all samples.
        self.features_df = self.features_df.loc[(self.features_df > 0).any(axis=1)]

        if keep_features:
            self.features_df = self.features_df.loc[self.features_df.index.isin(keep_features)]

        if remove_correlated_feats:
            # Work on the transpose: features become columns.
            tmp = self.features_df.T

            # Remove columns with no variation
            nunique = tmp.apply(pd.Series.nunique)
            cols_to_drop = nunique[nunique == 1].index
            tmp.drop(cols_to_drop, axis=1, inplace=True)

            # Distance matrix: 1 - |Spearman rho| (0 for perfectly correlated).
            perc_spearman = scipy.stats.spearmanr(tmp)
            abs_corr = np.subtract(np.ones(shape=perc_spearman.correlation.shape),
                                   np.absolute(perc_spearman.correlation))
            np.fill_diagonal(abs_corr, 0)
            abs_corr_clean = np.maximum(abs_corr,
                                        abs_corr.transpose())  # some floating point mismatches, just make symmetric
            # Average-linkage clustering cut at distance 0.1, i.e. features
            # with |rho| >= 0.9 end up in the same cluster.
            clustering = linkage(squareform(abs_corr_clean), method='average')
            clusters = fcluster(clustering, .1, criterion='distance')
            names = tmp.columns.tolist()
            names_to_cluster = list(zip(names, clusters))
            indices_to_keep = []
            ### Extract models closest to cluster centroids
            for x in range(1, len(set(clusters)) + 1):
                # Create mask from the list of assignments for extracting submatrix of the cluster
                mask = np.array([1 if i == x else 0 for i in clusters], dtype=bool)

                # Take the index of the column with the smallest sum of distances from the submatrix
                idx = np.argmin(sum(abs_corr_clean[:, mask][mask, :]))

                # Extract names of cluster elements from names_to_cluster
                sublist = [name for (name, cluster) in names_to_cluster if cluster == x]

                # Element closest to centroid
                centroid = sublist[idx]
                indices_to_keep.append(centroid)

            self.features_df = self.features_df.loc[self.features_df.index.isin(indices_to_keep)]
Example 3
Project: aca   Author: geekinglcq   File: lsi_model.py    License: MIT License 6 votes vote down vote up
def clean_data(text):
    """Tokenize *text*, dropping punctuation characters and stop words.

    :param text: raw input string
    :return: list of lower-cased tokens with punctuation characters and a
        small set of English stop words removed
    """
    # Use sets for O(1) membership tests (the original used a list that also
    # contained a duplicate backtick entry) and build the cleaned string in
    # a single join instead of a char-by-char append loop.
    punctuation = set('. , ! : ? `'.split())
    text = ''.join(ch for ch in text.strip() if ch not in punctuation)

    stop_list = set('a is are on from for and not to'.split())
    #stop_list =set('a is are on from for and not to that this there these \
    #               those have has been were I you me they can could be do . , : ! ? '.split())
    return [word for word in text.lower().split() if word not in stop_list]
Example 4
Project: BoxOfficeMojo   Author: skozilla   File: movie.py    License: MIT License 6 votes vote down vote up
def clean_data(self):
        """Formats all the extracted data into the appropriate types.

        Converts the weekly box-office fields to money/percent/date/int
        values, removes derived summary keys ("Total Gross" and dotted
        keys), then standardizes the remaining key names.
        """

        for results in self.data["Weekly"]:
            utils.convert_financial_field(results, "Average Per Theatre")
            utils.convert_financial_field(results, "Gross")
            utils.convert_financial_field(results, "Gross To Date")
            utils.convert_percent_field(results, "Week Over Week Change")
            utils.convert_date_field(results, "Week")
            utils.convert_int_field(results, "Rank")
            utils.convert_int_field(results, "Theaters")
            utils.convert_int_field(results, "Theatre Change")
            utils.convert_int_field(results, "Week Number")

        # Snapshot the keys so we can pop while "iterating".  The original
        # popped inside an iteritems() loop and had to `break` after the
        # first deletion, leaving any further matching keys in place.
        for key in list(self.data.keys()):
            if "Total Gross" in key or "." in key:
                self.data.pop(key)
        utils.standardize_keys(self.data)
Example 5
Project: titanic_machine_learning_example   Author: ageitgey   File: train_and_predict.py    License: MIT License 6 votes vote down vote up
def cleanData(data):
  """Clean a Titanic passenger DataFrame in place and return it.

  - Fares of 0 are treated as missing and replaced with the mean fare of
    the passenger's class.
  - Names are mapped to numeric title codes via ``parseName``.
  - Sex is encoded as 0 (male) / 1 (anything else).

  NOTE(review): ``pivot_table(..., rows=...)`` is the pre-0.14 pandas API;
  modern pandas spells this ``index=`` — confirm the pinned pandas version.
  """
  # If fare data is missing, replace it with the average from that class
  data.Fare = data.Fare.map(lambda x: np.nan if x==0 else x)
  classmeans = data.pivot_table('Fare', rows='Pclass', aggfunc='mean')
  data.Fare = data[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )

  # Turn names into a number representing titles
  data.Name = data.Name.map(lambda x: parseName(x))

  # Convert sex into a numeric value
  data.Sex = data.Sex.apply(lambda sex: 0 if sex == "male" else 1)

  return data


# Load training and test data sets, cleaning them in the process 
Example 6
Project: DeepLearning-IDS   Author: rambasnet   File: data_cleanup.py    License: MIT License 6 votes vote down vote up
def cleanAllData():
    """Clean every traffic-capture file in ``../ProcessedTrafficData``.

    Writes one cleaned file per input file into ``../NewCleanedData``
    (created if absent), skipping hidden entries and sub-directories.
    """
    # inputDataPath = os.path.join(
    #    os.path.dirname(os.path.realpath(__file__)), )
    inputDataPath = '../ProcessedTrafficData'
    outputDataPath = '../NewCleanedData'
    if not os.path.exists(outputDataPath):
        os.mkdir(outputDataPath)

    for file in os.listdir(inputDataPath):
        if file.startswith('.'):
            continue
        inputFile = os.path.join(inputDataPath, file)
        # BUG FIX: the directory test must use the full path; the bare name
        # was resolved against the CWD and effectively never matched.
        if os.path.isdir(inputFile):
            continue
        outFile = os.path.join(outputDataPath, file)
        cleanData(inputFile, outFile)
Example 7
Project: osbs-client   Author: containerbuildsystem   File: make_mock_json.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def clean_data(self, out_data):
        """Recursively scrub host names from *out_data*.

        Dicts and lists are walked recursively; strings matching
        ``self.rh_pattern`` have ``self.ex_pattern`` occurrences rewritten
        to ``example.com``; every other value passes through unchanged.
        """
        if isinstance(out_data, dict):
            return {key: self.clean_data(value)
                    for key, value in out_data.items()}
        if isinstance(out_data, list):
            return [self.clean_data(item) for item in out_data]
        if isinstance(out_data, str) and re.search(self.rh_pattern, out_data):
            return re.sub(self.ex_pattern, "example.com", out_data)
        return out_data
Example 8
Project: olapy   Author: abilian   File: execute.py    License: GNU Lesser General Public License v2.1 6 votes vote down vote up
def clean_data(star_schema_df, measures):
        """measure like this: 1 349 is not numeric so we try to transform it to
        1349.

        Object-typed measure columns get their spaces stripped and are cast
        to float; columns that still cannot be converted are dropped.

        :param star_schema_df: star schema dataframe
        :param measures: list of measures columns names

        :return: cleaned dataframe
        """
        if measures:
            for measure in measures:
                if star_schema_df[measure].dtype == object:
                    star_schema_df[measure] = star_schema_df[measure].str.replace(
                        " ", ""
                    )
                    try:
                        star_schema_df[measure] = star_schema_df[measure].astype(
                            "float"
                        )
                    except ValueError:
                        # keyword axis=1: the positional axis argument was
                        # deprecated and removed in pandas 2.0.
                        star_schema_df = star_schema_df.drop(measure, axis=1)
        return star_schema_df
Example 9
Project: Phen2Gene   Author: WGLab   File: filter.py    License: MIT License 6 votes vote down vote up
def clean_term_data(HPid,xref,is_a,name,definition,is_obsolete,replaced_by,consider,alt_id,synonym,created_by, creation_date,comment, subset,property_value):
    """Reset every per-term parsing accumulator to its empty default.

    The incoming arguments are ignored; a fresh tuple of defaults is
    returned so the caller can rebind all of its parsing state at once.
    Container defaults are newly created on every call (never shared).
    """
    return (
        "",      # HPid
        [],      # xref
        [],      # is_a
        "",      # name
        "",      # definition
        False,   # is_obsolete
        [],      # replaced_by
        [],      # consider
        [],      # alt_id
        [],      # synonym
        "",      # created_by
        "",      # creation_date
        "",      # comment
        "",      # subset
        "",      # property_value
    )
Example 10
Project: FunpySpiderSearchEngine   Author: mtianyan   File: zhihu_item.py    License: Apache License 2.0 6 votes vote down vote up
def clean_data(self):
        """Normalise scraped zhihu answer fields in place.

        Converts vote/comment counts to ints, unix timestamps to formatted
        datetime strings, and strips HTML tags from the answer content.
        """
        try:
            self["praise_num"] = extract_num("".join(self["praise_num"]))
        except BaseException:
            # Missing or unparsable praise count: default to zero.
            self["praise_num"] = 0
        self["comments_num"] = extract_num("".join(self["comments_num"]))

        # create_time/update_time arrive as unix timestamps (seconds).
        self["create_time"] = datetime.datetime.fromtimestamp(
            self["create_time"]).strftime(SQL_DATETIME_FORMAT)
        try:
            self["update_time"] = datetime.datetime.fromtimestamp(
                self["update_time"]).strftime(SQL_DATETIME_FORMAT)
        except:
            # No update timestamp: fall back to the creation time.
            self["update_time"] = self["create_time"]

        self["crawl_time"] = self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
        self["content"] = remove_tags(self["content"])
Example 11
Project: koku   Author: project-koku   File: report_db_accessor_base.py    License: GNU Affero General Public License v3.0 6 votes vote down vote up
def clean_data(self, data, table_name):
        """Clean data for insertion into database.

        Args:
            data (dict): The data to be cleaned
            table_name (str): The table name the data is associated with

        Returns:
            (dict): The data with values converted to required types

        """
        column_types = self.report_schema.column_types[table_name]

        for key, value in data.items():
            # Normalise empties first: both None and "" become NULL.
            if value is None or value == "":
                data[key] = None
                continue
            declared_type = column_types.get(key)
            if declared_type in (int, "BigIntegerField"):
                data[key] = self._convert_value(value, int)
            elif declared_type == float:
                data[key] = self._convert_value(value, float)
            elif declared_type == Decimal:
                data[key] = self._convert_value(value, Decimal)

        return data
Example 12
Project: betterlifepsi   Author: betterlife   File: cli.py    License: MIT License 6 votes vote down vote up
def clean_transaction_data():
    """
    Clean all the transaction data, and keep all master data

    Deletes rows from every transactional table (value links, inventory
    movements, incoming/expense records, shippings, receivings, purchase
    and sales orders, inventory transactions) in child-first order so
    foreign-key constraints are not violated, then commits.
    """
    # TODO.xqliu Disable clean of database for production
    from psi.app.service import Info
    database = Info.get_db()
    database.engine.execute("""
        DELETE FROM related_values;
        DELETE FROM inventory_in_out_link;
        DELETE FROM incoming;
        DELETE FROM shipping_line;
        DELETE FROM shipping;
        DELETE FROM expense;
        DELETE FROM receiving_line;
        DELETE FROM receiving;
        DELETE FROM purchase_order_line;
        DELETE FROM purchase_order;
        DELETE FROM sales_order_line;
        DELETE FROM sales_order;
        DELETE FROM inventory_transaction_line;
        DELETE FROM inventory_transaction;
        commit;
    """)
Example 13
Project: DataExploration   Author: AllenDowney   File: marriage.py    License: MIT License 6 votes vote down vote up
def CleanData(resp):
    """Cleans a respondent DataFrame.

    resp: DataFrame of respondents

    Adds columns: agemarry, age, year, decade, fives
    """
    # Codes 9997-9999 are treated as missing marriage dates.
    resp.cmmarrhx.replace([9997, 9998, 9999], np.nan, inplace=True)

    # cm* fields are century-months; dividing by 12 converts to years.
    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

    # Century-month origin: offsets are added to December 1899.
    month0 = pd.to_datetime('1899-12-15')
    dates = [month0 + pd.DateOffset(months=cm) 
             for cm in resp.cmbirth]
    # Birth year relative to 1900, then 10-year and 5-year cohort buckets.
    resp['year'] = (pd.DatetimeIndex(dates).year - 1900)
    resp['decade'] = resp.year // 10
    resp['fives'] = resp.year // 5
Example 14
Project: darwin-sdk   Author: sparkcognition   File: sdk.py    License: Apache License 2.0 6 votes vote down vote up
def clean_data(self, dataset_name, **kwargs):
        """Request server-side cleaning of *dataset_name*.

        If the server responds that no raw profile exists yet, runs
        ``analyze_data`` first and retries the clean request once.

        :param dataset_name: dataset identifier (URL-quoted into the route)
        :param kwargs: forwarded verbatim as the JSON request body
        :return: result of ``get_return_info(r)`` — presumably an
            (ok, payload) pair, mirroring the failure tuple below; confirm
            against ``get_return_info`` — or ``(False, message)`` when no
            auth token is available
        """
        url = self.server_url + self.routes['clean_data'] + urllib.parse.quote(dataset_name, safe='')
        headers = self.get_auth_header()
        parameters = kwargs
        if headers is None:
            return False, "Cannot get Auth token. Please log in."
        r = self.s.post(url, headers=headers, json=parameters)
        if not r.ok and 'Please run analyze data' in r.text:
            print("Raw profile not found. Running analyze_data")
            char_encoding = parameters['char_encoding'] if 'char_encoding' in parameters else 'utf-8'
            r = self.analyze_data(dataset_name, char_encoding=char_encoding)
            if r[0]:
                r = self.s.post(url, headers=headers, json=parameters)
            else:
                return r
        return self.get_return_info(r)

    # Create risk information for a datatset 
Example 15
Project: santander-product-recommendation-8th-place   Author: yaxen   File: clean.py    License: MIT License 6 votes vote down vote up
def clean_data(fi, fo, header, suffix):
    """Normalise one raw CSV stream and write it to *fo*.

    The ``nomprov`` (province name) column may itself contain a comma,
    which splits one field into two; such rows are repaired by re-joining
    the two halves.

    :param fi: readable text file object (input CSV, first line is header)
    :param fo: writable text file object for the cleaned CSV
    :param header: if True, write the header row to *fo*
    :param suffix: string appended verbatim to every data row
    :raises ValueError: if the header has no ``nomprov`` column (previously
        this fell through and crashed later with an unbound-name NameError)
    """
    head = fi.readline().strip("\n").split(",")
    head = [h.strip('"') for h in head]
    ip = None
    for i, h in enumerate(head):
        if h == "nomprov":
            ip = i
    if ip is None:
        raise ValueError("input file has no 'nomprov' column")
    print(ip)
    n = len(head)
    if header:
        fo.write("%s\n" % ",".join(head))

    print(n)
    for line in fi:
        fields = line.strip("\n").split(",")
        if len(fields) > n:
            # Comma inside the province name: merge the split halves.
            prov = fields[ip] + fields[ip+1]
            del fields[ip]
            fields[ip] = prov
        assert len(fields) == n
        fields = [field.strip() for field in fields]
        fo.write("%s%s\n" % (",".join(fields), suffix))
Example 16
Project: keras-punctuator   Author: vackosar   File: punctuator.py    License: MIT License 6 votes vote down vote up
def cleanData(inputFile):
    """Normalise raw text from *inputFile* for the punctuation model.

    Applies an ordered list of regex substitutions (apostrophe and token
    normalisation, dash removal, unknown-character removal) and writes the
    result to ``<inputFile>.clean``.

    :param inputFile: path of the text file to clean
    :return: path of the cleaned output file
    """
    sys.stderr.write("Cleaning data " + inputFile + "\n")
    # Raw strings are used so regex escapes such as \g<1> and \. are not
    # mangled by Python string escaping (invalid escape sequences warn on
    # Python 3.6+ and raise SyntaxWarning on 3.12+).
    mappings = OrderedDict([
        (re.compile(r"['’]"), "'"),
        # (re.compile("' s([" + DOT_LIKE_AND_SPACE + "])"), "'s\\g<1>"), # Removes strange text mistake pattern in europarl data.
        (re.compile(r"n't"), " n't"),
        #(re.compile(" '([^" + DOT_LIKE + "']*)'"), '. \\g<1>.'), # Remove quoting apostrophes.
        (re.compile(r"'([^t])"), r" '\g<1>"), # Separate tokens like "'s" "'ll" and so on.
        #(re.compile('\\([^)]*\\)'), ''), # Removes bracketed.
        (re.compile(r'[-—]'), ' '), # Dash to space.
        (re.compile(r"[^a-z0-9A-Z',\.?! ]"), ' '), # Other unknown to space.
        # (re.compile('^$|^\\.$'), ''), # Removes empty line.
    ])
    cleanFile = inputFile + '.clean'
    regexProcess(mappings, inputFile, cleanFile)
    return cleanFile
Example 17
Project: DataProcessing_Python   Author: vopani   File: data_cleaning.py    License: MIT License 6 votes vote down vote up
def clean_data(self):
        """Performs standard data cleaning functions

        Runs the standard pipeline in order: extract the id and target
        columns, validate column names, drop constant columns, binarise
        two-level columns, then handle date and categorical variables and
        encode categories.  Each step is delegated to a sibling method of
        this class (not shown here).
        """

        self.extract_ids()
        self.extract_target()

        self.check_column_names()

        self.remove_constant_variables()
        self.convert_columns_to_binary()

        self.check_date_variables()
        self.check_categorical_variables()
        self.encode_categories()

        print("Data is clean and ready!\n")


    ## function for removing columns 
Example 18
Project: python-cqhttp   Author: nonebot   File: cqhttp_helper.py    License: MIT License 6 votes vote down vote up
def clean_data_dir(self, *, data_dir):
        """
        Clean up a data directory.

        ------------

        :param str data_dir: name of the directory to clean; supports
            `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean data directories that have accumulated too many old
        files, e.g. `image`.

        Added in HTTP API v3.3.4
        """
        # Delegates to the generic endpoint dispatcher by API name.
        return super().__getattr__('clean_data_dir') \
            (data_dir=data_dir)
Example 19
Project: libhxl-python   Author: HXLStandard   File: model.py    License: The Unlicense 6 votes vote down vote up
def clean_data(
            self, whitespace=None, upper=None, lower=None, date=None, date_format=None,
            number=None, number_format=None, latlon=None, purge=False, queries=None
    ):
        """Clean data fields.

        All column-list parameters default to empty lists.  ``None`` (the
        new default) is accepted and treated as "no columns selected";
        previously the defaults were shared mutable ``[]`` objects, which
        Python evaluates once and reuses across calls.

        :return: a ``hxl.filters.CleanDataFilter`` wrapping this source
        """
        import hxl.filters
        return hxl.filters.CleanDataFilter(
            self,
            whitespace=whitespace or [],
            upper=upper or [],
            lower=lower or [],
            date=date or [], date_format=date_format,
            number=number or [], number_format=number_format,
            latlon=latlon or [],
            purge=purge,
            queries=queries or []
        )
Example 20
Project: ZeroShotVideoClassification   Author: bbrattoli   File: auxiliary_dataset.py    License: Apache License 2.0 6 votes vote down vote up
def clean_data(fnames, labels):
        """Filter out videos that appear on the known-broken list.

        Returns (fnames, labels) unchanged when the inputs are not string
        paths or when the broken-video list file is absent; otherwise both
        are converted to numpy arrays with broken entries removed.
        """
        if not isinstance(fnames[0], str):
            print('Cannot check for broken videos')
            return fnames, labels
        broken_videos_file = 'assets/kinetics_broken_videos.txt'
        if not os.path.exists(broken_videos_file):
            print('Broken video list does not exists')
            return fnames, labels

        t = time()
        with open(broken_videos_file, 'r') as f:
            broken_samples = [r[:-1] for r in f.readlines()]
        # Compare path suffixes only; the first 75 chars are presumably the
        # dataset root prefix — TODO confirm against the file layout.
        data = [x[75:] for x in fnames]
        # Keep samples NOT on the broken list (`~` instead of `== False`).
        keep_sample = ~np.in1d(data, broken_samples)
        fnames = np.array(fnames)[keep_sample]
        labels = np.array(labels)[keep_sample]
        print('Broken videos %.2f%% - removing took %.2f' % (100 * (1.0 - keep_sample.mean()), time() - t))
        return fnames, labels
Example 21
Project: Short-Text-Summarization   Author: yangzhiye   File: get_data.py    License: Apache License 2.0 6 votes vote down vote up
def get_clean_data(filepath):
	"""Load (summary, short_text) pairs and strip unwanted characters.

	Keeps CJK characters, ASCII alphanumerics and a whitelist of Chinese
	punctuation (,。!?;“”:《》); bracket characters and everything else
	are removed.  Each cleaned line is re-encoded to UTF-8 bytes.

	:param filepath: path handed to ``parser_txt_to_data``
	:return: (list_summary, list_short_text) lists of UTF-8 byte strings
	"""
	# use re to delete useless information in data
	list_summary , list_short_text = parser_txt_to_data(filepath)

	def _remove_special_char(m):
		# Keep whitelisted Chinese punctuation; drop any other match.
		s = m.group(0)
		if s in u',。!?;“”:《》':
			return s
		return ''

	for i,line in enumerate(list_summary):
		# Strip bracket characters first, then filter non-CJK/non-alphanumerics.
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_summary[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')

	for i,line in enumerate(list_short_text):
		line = re.sub(u'[\(\[(#「【\)\])#」】]', '', line)
		list_short_text[i] = re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z]', _remove_special_char, line).encode('utf-8')

	#print len(list_summary),type(list_summary),len(list_short_text),type(list_short_text)
	return list_summary , list_short_text
Example 22
Project: apex-sigma-core   Author: lu-ci   File: kanji.py    License: GNU General Public License v3.0 6 votes vote down vote up
def clean_readings_data(kanji_dict):
    """Scrub junk characters from a kanji reading table.

    :param kanji_dict: dict with a 'readings' mapping of reading type
        ('kun', 'on', 'names') to a list of reading strings
    :return: dict of the same shape with separator characters (、, tabs,
        spaces) stripped from each reading and junk-only entries dropped
    :rtype: dict
    """
    junk = ['、 ', '、', '\t', ' ']
    cleaned = {'readings': {'kun': [], 'on': [], 'names': []}}
    for reading_type, entries in kanji_dict['readings'].items():
        for entry in entries:
            # Entries that are nothing but a separator are discarded.
            if entry in junk:
                continue
            for junk_char in junk:
                if junk_char in entry:
                    entry = entry.replace(junk_char, '')
            cleaned['readings'][reading_type].append(entry)
    return cleaned
Example 23
Project: Pirates-Online-Rewritten   Author: PiratesOnlineRewritten   File: LegendaryFish.py    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def cleanFishData(self):
        """Reset UI and animation state after a legendary fish encounter.

        Restores the stamina bar/label from ``self.myData``, stops the
        stamina-update task, resets the chase and struggle sequences, and
        pauses the about-to-bite interval when one exists.
        """
        self.staminaValue = self.myData['stamina']
        self.fishStaminaBar['value'] = self.staminaValue
        self.fishStaminaValueLabel.setText(str(int(self.staminaValue)) + ' / ' + str(100))
        # Pick the bar colour bucket proportional to stamina (0-100 scale).
        self.fishStaminaBar['barColor'] = FishingGlobals.fishingStaminaBarColor[int(self.staminaValue / 100.0 * (len(FishingGlobals.fishingStaminaBarColor) - 1))]
        self.hideStaminaBar()
        taskMgr.remove('updateFishStaminaTask')
        self.lurePosition = None
        self.fishChaseLureSequence.pause()
        self.fishChaseLureSequence.clearToInitial()
        self.lfStruggleSequence.pause()
        self.lfStruggleSequence.clearToInitial()
        if self.aboutToBitingInterval is None:
            return
        self.aboutToBitingInterval.pause()
        return
Example 24
Project: python-cqhttp   Author: nonebot   File: cqhttp_helper.py    License: MIT License 6 votes vote down vote up
def clean_data_dir_async(self, *, data_dir):
        """
        Clean up a data directory (asynchronous version).

        ------------

        :param str data_dir: name of the directory to clean; supports
            `image`, `record`, `show`, `bface`
        :return: None
        :rtype: None

        ------------

        Used to clean data directories that have accumulated too many old
        files, e.g. `image`.

        Added in HTTP API v3.3.4
        """
        # Delegates to the generic endpoint dispatcher by API name.
        return super().__getattr__('clean_data_dir_async') \
            (data_dir=data_dir)
Example 25
Project: invana-bot   Author: invanalabs   File: selectors.py    License: MIT License 6 votes vote down vote up
def clean_data(elements=None, selector=None):
    """Apply the selector's declared data type to the extracted elements.

    Extracts either a list of values or a single value (depending on
    whether the selector's ``data_type`` starts with ``List``) and then
    runs the result through ``transform_data``.

    :param elements: extracted page elements
    :param selector: selector config dict; ``data_type`` defaults to
        "RawField"
    :return: transformed data
    """
    data_type = selector.get("data_type", "RawField")
    # "List..." types produce one value per element; anything else yields a
    # single value.
    wants_many = data_type.startswith("List")

    extractor = SelectorExtractor()
    if wants_many:
        extracted = extractor.get_list_data(elements=elements)
    else:
        extracted = extractor.get_single_data(elements=elements)
    return transform_data(data=extracted, data_type=data_type)
Example 26
Project: sanpy   Author: santiment   File: event_study.py    License: MIT License 6 votes vote down vote up
def clean_data(data, events, starting_point):
    """
    Drop events that lack sufficient pricing data around the event date.

    An event is discarded when its date or symbol is missing from *data*,
    or when the +/- *starting_point* pricing window around the event is
    empty or touches a zero price.
    """
    events_df = events.copy(deep=True)
    events_df['in_pricesdf'] = 0

    for row_idx, (date, row) in enumerate(events_df.iterrows()):
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            events_df.iloc[row_idx, -1] = 1
            continue
        event_day = data.index.searchsorted(date)
        window = data.iloc[event_day - starting_point:event_day + starting_point][[sid]]
        if window.min()[0] == 0 or len(window) == 0:
            events_df.iloc[row_idx, -1] = 1
    return events_df[events_df['in_pricesdf'] == 0]
Example 27
Project: carebot   Author: thecarebot   File: jquery_scrolldepth.py    License: MIT License 6 votes vote down vote up
def clean_data(self, data):
        """
        Fix data types, truncate the data, and otherwise make it fit for
        consumption.
        """
        converted = []
        for row in data:
            # In-place conversions: percent depth, user count, seconds.
            row[0], row[1], row[2] = int(row[0]), int(row[1]), int(row[2])
            converted.append(row)

        # Order rows by percent depth (10% ... 100%).
        converted.sort(key=lambda entry: entry[0])

        converted = self.fill_in_max(converted)

        # Cap the result at ten rows.
        return converted[:10]
Example 28
Project: oh-my-rss   Author: richshaw2015   File: cron.py    License: MIT License 6 votes vote down vote up
def clean_history_data():
    """
    Purge historical article data; retention is tiered by site star rating.

    - star < 10: articles older than one week are deleted outright
    - 10 <= star < 20: articles older than three months have content blanked
    - star >= 20: articles older than one year have content blanked

    Finally the SQLite database is vacuumed to reclaim space.
    :return:
    """
    logger.info('开始清理历史数据')

    lastweek = datetime.now() - timedelta(days=7)
    last3month = datetime.now() - timedelta(days=90)
    lastyear = datetime.now() - timedelta(days=365)

    # Sites with star < 10: delete articles older than one week.
    Article.objects.filter(site__star__lt=10, ctime__lte=lastweek).delete()

    # Sites with star in [10, 20): blank content older than three months.
    Article.objects.filter(site__star__gte=10, site__star__lt=20, ctime__lte=last3month).update(content=' ')

    # Sites with star >= 20: blank content older than one year.
    Article.objects.filter(site__star__gte=20, ctime__lte=lastyear).update(content=' ')

    # Compact the database after the deletions.
    vacuum_sqlite_db()

    logger.info('历史数据清理完毕')
Example 29
Project: yass   Author: paninski-lab   File: cluster.py    License: Apache License 2.0 6 votes vote down vote up
def clean_input_data(self):
        """Subsample and bound the spike times before clustering.

        Randomly subsamples (without replacement) to at most
        ``CONFIG.cluster.max_n_spikes`` spikes, then discards spikes that
        start before ``spike_size // 2`` or within ``spike_size`` of the end
        of the recording.  When clustering templates rather than raw data,
        the same index selections are applied to ``template_ids_in`` so the
        two arrays stay aligned.
        """
        # limit clustering to at most 50,000 spikes
        max_spikes = self.CONFIG.cluster.max_n_spikes
        if len(self.spike_times_original)>max_spikes:
            idx_sampled = np.random.choice(
                a=np.arange(len(self.spike_times_original)),
                size=max_spikes,
                replace=False)
            self.spike_times_original = self.spike_times_original[idx_sampled]
        else:
            idx_sampled = np.arange(len(self.spike_times_original))

        # limit indexes away from edge of recording
        idx_inbounds = np.where(np.logical_and(
                        self.spike_times_original>=self.spike_size//2,
                        self.spike_times_original<(self.reader_raw.rec_len-self.spike_size)))[0]
        self.spike_times_original = self.spike_times_original[
            idx_inbounds].astype('int32')

        # clean upsampled ids if available
        if not self.raw_data:
            self.template_ids_in = self.template_ids_in[
                idx_sampled][idx_inbounds].astype('int32')
Example 30
Project: webspider   Author: JustForFunnnn   File: lagou_companies.py    License: MIT License 6 votes vote down vote up
def clean_lg_company_data(company_dict):
    """
    Clean scraped Lagou company information in place.

    Strips whitespace from simple fields, converts rich-text fields to
    plain text, truncates the introduce/advantage fields to their stored
    maximum lengths, and splits the industries string into a set.

    :param company_dict: tornado.util.ObjectDict
    """
    if 'size' in company_dict:
        company_dict.size = company_dict.size.strip()
    if 'finance_stage' in company_dict:
        company_dict.finance_stage = company_dict.finance_stage.strip()
    if 'features' in company_dict:
        company_dict.features = utils.text.to_plaintext(company_dict.features)
    if 'address' in company_dict:
        company_dict.address = utils.text.to_plaintext(company_dict.address)
    if 'introduce' in company_dict:
        # introduce arrives as a list of fragments; join then truncate.
        company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else ''
        company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]
    if 'advantage' in company_dict:
        # Stored as a JSON-encoded list, truncated to the column limit.
        company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage))
        company_dict.advantage = json.dumps(company_dict.advantage)[
            :constants.COMPANY_ADVANTAGE_MAX_LEN]
    if 'industries' in company_dict:
        # Split on ASCII/CJK commas, 、 and whitespace.
        company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries))
Example 31
Project: yoda   Author: yoda-pa   File: util.py    License: MIT License 5 votes vote down vote up
def clean_soup_data(data):
    """Return *data* as plain text: HTML tags stripped, colons removed,
    surrounding whitespace trimmed."""
    text = re.sub("<.*?>", "", str(data))
    return text.replace(":", "").strip()
Example 32
Project: KubeOperator   Author: KubeOperator   File: inventory.py    License: Apache License 2.0 5 votes vote down vote up
def clean_hosts_data(self, initial_data):
        """Separate host entries from their group memberships.

        Records each host's group list in ``self.hosts_groups_map`` (keyed
        by host name) and returns the host dicts with the 'groups' key
        removed (the input dicts are mutated).
        """
        raw_hosts = initial_data.get('hosts')
        result = []

        for entry in raw_hosts:
            # pop() both strips the key and yields the membership list.
            self.hosts_groups_map[entry['name']] = entry.pop('groups', [])
            result.append(entry)
        return result
Example 33
Project: pushwatch   Author: amyth   File: forms.py    License: MIT License 5 votes vote down vote up
def clean_gcm_data(self):
        """Validate that the submitted ``gcm_data`` form field is JSON.

        Empty values pass through untouched; any non-empty value must parse
        as JSON or a ValidationError is raised.
        """
        gcm_data = self.cleaned_data.get('gcm_data')
        if not gcm_data:
            return gcm_data
        try:
            json.loads(gcm_data)
        except Exception:
            raise forms.ValidationError(
                    'Data you entered does not seem to be valid json.')
        return gcm_data
Example 34
Project: MachineLearningNote   Author: AlanConstantine   File: SinglePass.py    License: MIT License 5 votes vote down vote up
def clean_data(data):
    """Tokenise each record and key it by a 1-based sequence number.

    :param data: iterable of whitespace-separated record strings
    :return: dict mapping record number -> list of tokens
    """
    return {seq: record.strip().split()
            for seq, record in enumerate(data, start=1)}
Example 35
Project: Osmedeus   Author: j3ssie   File: clean.py    License: MIT License 5 votes vote down vote up
def clean_data_tables(workspace):
    """Delete stored scan records.

    With *workspace* given, removes only that workspace's activities,
    summaries, reports and the workspace row itself; with a falsy value,
    wipes all rows from all four tables.

    :param workspace: workspace name, or falsy for a full wipe
    """
    utils.print_info('Clean Data Tables')
    if workspace:
        Activities.objects.filter(workspace=workspace).delete()
        Workspaces.objects.filter(workspace=workspace).delete()
        Summaries.objects.filter(workspace=workspace).delete()
        Reports.objects.filter(workspace=workspace).delete()
    else:
        Activities.objects.all().delete()
        Workspaces.objects.all().delete()
        Summaries.objects.all().delete()
        Reports.objects.all().delete()
Example 36
Project: nutonomy_pointpillars   Author: SmallMunich   File: eval.py    License: MIT License 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    """Classify ground-truth and detection boxes for one class/difficulty.

    Follows the KITTI-style evaluation convention: each ground-truth box is
    marked 0 (counted), 1 (ignored: neighbouring class, or beyond the
    occlusion/truncation/height limits for this difficulty) or -1 (other
    class); "DontCare" boxes are collected separately.  Detections shorter
    than the minimum height are ignored (1) rather than counted.

    :param gt_anno: per-image ground-truth dict with "name", "bbox",
        "occluded", "truncated" arrays
    :param dt_anno: per-image detection dict with "name" and "bbox" arrays
    :param current_class: index into the class-name table below
    :param difficulty: 0/1/2 selects the easy/moderate/hard thresholds
    :return: (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes)
    """
    CLASS_NAMES = ['car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car', 'tractor', 'trailer']
    # Per-difficulty limits: minimum box height (px), maximum occlusion
    # level, maximum truncation fraction.
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    dc_bboxes, ignored_gt, ignored_dt = [], [], []
    current_cls_name = CLASS_NAMES[current_class].lower()
    num_gt = len(gt_anno["name"])
    num_dt = len(dt_anno["name"])
    num_valid_gt = 0
    for i in range(num_gt):
        bbox = gt_anno["bbox"][i]
        gt_name = gt_anno["name"][i].lower()
        height = bbox[3] - bbox[1]
        # valid_class: 1 = the evaluated class, 0 = similar neighbouring
        # class (ignored, not penalised), -1 = unrelated class.
        valid_class = -1
        if (gt_name == current_cls_name):
            valid_class = 1
        elif (current_cls_name == "Pedestrian".lower()
              and "Person_sitting".lower() == gt_name):
            valid_class = 0
        elif (current_cls_name == "Car".lower() and "Van".lower() == gt_name):
            valid_class = 0
        else:
            valid_class = -1
        ignore = False
        # Boxes harder than this difficulty allows are ignored, not counted.
        if ((gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty])
                or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty])
                or (height <= MIN_HEIGHT[difficulty])):
            # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1:
            ignore = True
        if valid_class == 1 and not ignore:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif (valid_class == 0 or (ignore and (valid_class == 1))):
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)
    # for i in range(num_gt):
        if gt_anno["name"][i] == "DontCare":
            dc_bboxes.append(gt_anno["bbox"][i])
    for i in range(num_dt):
        if (dt_anno["name"][i].lower() == current_cls_name):
            valid_class = 1
        else:
            valid_class = -1
        # dt_anno["bbox"] is indexed 2-D here (i, coord), unlike the
        # per-row indexing used for gt above.
        height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1])
        if height < MIN_HEIGHT[difficulty]:
            ignored_dt.append(1)
        elif valid_class == 1:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
Example 37
Project: Sarcasm-Detection   Author: MirunaPislar   File: data_processing.py    License: MIT License 5 votes vote down vote up
def get_filtered_clean_data(train_filename, test_filename):
    """Load the train/test token files and return their filtered (ulterior-cleaned) versions.

    The cleaned output of each file is also written to a sibling file with a
    ``filtered_`` prefix inside the same data directory.
    """
    print("Loading data...")
    data_dir = path + "/res/data/"
    cleaned = []
    for fname in (train_filename, test_filename):
        raw_tokens = utils.load_file(data_dir + fname)
        cleaned.append(ulterior_clean(raw_tokens, data_dir + "filtered_" + fname))
    return tuple(cleaned)


# Grammatical clean of data (designed to be applied on top of initial clean - e.g. train.txt) 
Example 38
Project: swarfarm   Author: PeteAndersen   File: forms.py    License: Apache License 2.0 5 votes vote down vote up
def clean_json_data(self):
        """Parse the submitted ``json_data`` field and return the decoded object.

        Raises:
            forms.ValidationError: if the field does not contain valid JSON.
        """
        import json

        data = self.cleaned_data['json_data']

        try:
            data = json.loads(data)
        # json.JSONDecodeError is a subclass of ValueError; TypeError covers
        # non-string input. A bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit, which must propagate.
        except (TypeError, ValueError):
            raise forms.ValidationError("Error parsing JSON data.")

        return data
Example 39
Project: pytorch-flows   Author: ikostrikov   File: gas.py    License: MIT License 5 votes vote down vote up
def load_data_and_clean_and_split(file):
    """Load and clean the dataset, then split it into train/validate/test arrays.

    The last 10% of rows becomes the test set; of the remaining rows, the
    last 10% becomes the validation set and the rest the training set.
    """
    # DataFrame.as_matrix() was removed in pandas 0.25; .values is the
    # backward- and forward-compatible equivalent.
    data = load_data_and_clean(file).values
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data_train = data[0:-N_test]
    N_validate = int(0.1 * data_train.shape[0])
    data_validate = data_train[-N_validate:]
    data_train = data_train[0:-N_validate]

    return data_train, data_validate, data_test
Example 40
Project: pvcnn   Author: mit-han-lab   File: eval.py    License: MIT License 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    """Partition KITTI ground-truth and detection boxes for one class/difficulty.

    Returns (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) where the
    ignored lists hold 0 (use), 1 (ignore) or -1 (other class) per box, and
    dc_bboxes collects the 'DontCare' ground-truth boxes.
    """
    class_names = ['car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car', 'tractor', 'trailer']
    min_height = (40, 25, 25)
    max_occlusion = (0, 1, 2)
    max_truncation = (0.15, 0.3, 0.5)
    cls_name = class_names[current_class].lower()
    # Classes similar enough to the target that they are ignored, not penalized.
    similar = {'pedestrian': 'person_sitting', 'car': 'van'}
    dc_bboxes = []
    ignored_gt = []
    ignored_dt = []
    num_valid_gt = 0
    for raw_name, box, occ, trunc in zip(gt_anno['name'], gt_anno['bbox'],
                                         gt_anno['occluded'], gt_anno['truncated']):
        lowered = raw_name.lower()
        if lowered == cls_name:
            validity = 1
        elif similar.get(cls_name) == lowered:
            validity = 0
        else:
            validity = -1
        box_height = box[3] - box[1]
        too_hard = (occ > max_occlusion[difficulty]
                    or trunc > max_truncation[difficulty]
                    or box_height <= min_height[difficulty])
        if validity == 1 and not too_hard:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif validity == 0 or (too_hard and validity == 1):
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)
        if raw_name == 'DontCare':
            dc_bboxes.append(box)
    for det_name, det_box in zip(dt_anno['name'], dt_anno['bbox']):
        det_height = abs(det_box[3] - det_box[1])
        if det_height < min_height[difficulty]:
            ignored_dt.append(1)
        elif det_name.lower() == cls_name:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
Example 41
Project: zulip   Author: zulip   File: retention.py    License: Apache License 2.0 5 votes vote down vote up
def clean_archived_data() -> None:
    """Permanently delete ArchiveTransactions older than the vacuuming delay, in batches."""
    logger.info("Cleaning old archive data.")
    check_date = timezone_now() - timedelta(days=settings.ARCHIVED_DATA_VACUUMING_DELAY_DAYS)
    # Associated archived objects will get deleted through the on_delete=CASCADE property:
    stale_ids = list(ArchiveTransaction.objects.filter(
        timestamp__lt=check_date).values_list("id", flat=True))
    count = 0
    for start in range(0, len(stale_ids), TRANSACTION_DELETION_BATCH_SIZE):
        batch = stale_ids[start:start + TRANSACTION_DELETION_BATCH_SIZE]
        ArchiveTransaction.objects.filter(id__in=batch).delete()
        count += len(batch)

    logger.info("Deleted %s old ArchiveTransactions.", count)
Example 42
Project: pytorch-flows   Author: ikostrikov   File: gas.py    License: MIT License 5 votes vote down vote up
def load_data_and_clean(file):
    """Load the dataset, drop over-correlated columns, and standardize.

    Columns are removed one at a time (first offender each pass) because
    the correlation structure changes after every drop.
    """
    data = load_data(file)
    corr_counts = get_correlation_numbers(data)
    while np.any(corr_counts > 1):
        offender = data.columns[np.where(corr_counts > 1)[0][0]]
        data.drop(offender, axis=1, inplace=True)
        corr_counts = get_correlation_numbers(data)
    # Standardize each remaining column to zero mean, unit variance.
    return (data - data.mean()) / data.std()
Example 43
Project: zou   Author: cgwire   File: cli.py    License: GNU Affero General Public License v3.0 5 votes vote down vote up
def clean_tasks_data(projectid):
    """
    Reset task models data (retake count, wip start date and end date)
    """
    # No-op when no project id was supplied.
    if projectid is None:
        return
    commands.reset_tasks_data(projectid)
Example 44
Project: CloudBot   Author: TotallyNotRobots   File: chan_track.py    License: GNU General Public License v3.0 5 votes vote down vote up
def clean_user_data(user):
    """Re-sort the tracked status prefixes for every channel this user is in.

    :type user: User
    """
    for membership in user.channels.values():
        membership.sort_status()
Example 45
Project: piston-lib   Author: xeroc   File: storage.py    License: MIT License 5 votes vote down vote up
def clean_data(self):
        """ Delete files older than 70 days
        """
        log.info("Cleaning up old backups")
        cutoff = time.time() - 70 * 86400
        for filename in os.listdir(self.data_dir):
            backup_file = os.path.join(self.data_dir, filename)
            # Skip anything newer than the cutoff; only plain files are removed.
            if os.stat(backup_file).st_ctime >= cutoff:
                continue
            if os.path.isfile(backup_file):
                os.remove(backup_file)
                log.info("Deleting {}...".format(backup_file))
Example 46
Project: kitti-object-eval-python   Author: traveller59   File: eval.py    License: MIT License 5 votes vote down vote up
def clean_data(gt_anno, dt_anno, current_class, difficulty):
    """Classify KITTI annotations for evaluation at one class/difficulty level.

    Returns (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes); entries of the
    ignored lists are 0 (evaluate), 1 (ignore) or -1 (different class).
    """
    CLASS_NAMES = [
        'car', 'pedestrian', 'cyclist', 'van', 'person_sitting', 'car',
        'tractor', 'trailer'
    ]
    MIN_HEIGHT = [40, 25, 25]
    MAX_OCCLUSION = [0, 1, 2]
    MAX_TRUNCATION = [0.15, 0.3, 0.5]
    current_cls_name = CLASS_NAMES[current_class].lower()

    def classify(lowered_name):
        # 1 = exact class match, 0 = "neighbor" class (ignored, not penalized), -1 = other.
        if lowered_name == current_cls_name:
            return 1
        if current_cls_name == "pedestrian" and lowered_name == "person_sitting":
            return 0
        if current_cls_name == "car" and lowered_name == "van":
            return 0
        return -1

    dc_bboxes = []
    ignored_gt = []
    ignored_dt = []
    num_valid_gt = 0
    for i in range(len(gt_anno["name"])):
        raw_name = gt_anno["name"][i]
        bbox = gt_anno["bbox"][i]
        valid_class = classify(raw_name.lower())
        too_hard = (gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]
                    or gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]
                    or (bbox[3] - bbox[1]) <= MIN_HEIGHT[difficulty])
        if valid_class == 1 and not too_hard:
            ignored_gt.append(0)
            num_valid_gt += 1
        elif valid_class == 0 or (too_hard and valid_class == 1):
            ignored_gt.append(1)
        else:
            ignored_gt.append(-1)
        if raw_name == "DontCare":
            dc_bboxes.append(bbox)
    for i in range(len(dt_anno["name"])):
        det_height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1])
        if det_height < MIN_HEIGHT[difficulty]:
            ignored_dt.append(1)
        elif dt_anno["name"][i].lower() == current_cls_name:
            ignored_dt.append(0)
        else:
            ignored_dt.append(-1)

    return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
Example 47
Project: PVGeo   Author: OpenGeoVis   File: readers.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def clean_data_name(data_name, filename):
    """A helper to clean a filename to make a useful data array name"""
    # Only None or the empty string trigger the fallback; any other value
    # (including other falsy values) is returned untouched.
    if data_name is None or data_name == '':
        base = os.path.basename(filename)
        return os.path.splitext(base)[0]
    return data_name
Example 48
Project: Python-Journey-from-Novice-to-Expert   Author: PacktPublishing   File: data.py    License: MIT License 5 votes vote down vote up
def get_clean_data(source):
    """Load raw records from *source* and return the cleaned result."""
    return clean_data(load_data(source))
Example 49
Project: explorer   Author: blockcypher   File: forms.py    License: Apache License 2.0 5 votes vote down vote up
def clean_data_to_embed(self):
        """Return the stripped embed data, rejecting blank submissions."""
        stripped = self.cleaned_data['data_to_embed'].strip()
        if stripped:
            return stripped
        raise forms.ValidationError(_('Data to embed cannot be blank'))
Example 50
Project: Pair-Trading-Reinforcement-Learning   Author: wai-i   File: Cointegration.py    License: MIT License 5 votes vote down vote up
def clean_data(cls, x, y, on, col_name):
        """Align two frames on the *on* column, dropping rows that are not
        finite in both, and return the cleaned (df_x, df_y) pair.

        Note: *x* and *y* are modified in place (infinities become NaN).
        """
        # Treat infinities as missing so they are dropped together with NaNs.
        for frame in (x, y):
            frame.replace([np.inf, -np.inf], np.nan, inplace=True)
        joined = pd.merge(left=x, right=y, on=on, how='outer')
        complete = joined.loc[joined.notnull().all(axis=1), :]
        df_x = pd.DataFrame({on: complete[on].values,
                             col_name: complete[col_name + '_x'].values})
        df_y = pd.DataFrame({on: complete[on].values,
                             col_name: complete[col_name + '_y'].values})
        return df_x, df_y
Example 51
Project: DigiSparkStealer   Author: kovinevmv   File: main.py    License: MIT License 5 votes vote down vote up
def clean_collected_data(self):
        """Best-effort removal of every collected artifact file in ``self.files``.

        Removal failures are deliberately ignored (the file may be locked or
        already gone), but only OS-level errors are swallowed — a bare
        ``except:`` would also hide KeyboardInterrupt/SystemExit.
        """
        for f in self.files:
            try:
                if os.path.exists(f):
                    os.remove(f)
            except OSError:
                pass
Example 52
Project: mvp   Author: futureag   File: DewpointChart.py    License: MIT License 5 votes vote down vote up
def cleanData(data, test=False):
    '''Flatten structure to three columns (timestamp, name, value).

    Timestamps are binned into 20-minute groups so readings taken within the
    same 20-minute window share the same timestamp string.
    '''
    out = []
    for row in data:
        hold = {}
        # bin the timestamp into 20 minute groups
        d = UTCStrToLDT(row["start_date"]["timestamp"])
        # BUG FIX: the minute must be the group's starting minute (0, 20, 40),
        # not the bare group index (0, 1, 2) the old code produced.
        d = d.replace(second=0, minute=int(math.floor(d.minute / 20)) * 20)
        hold['timestamp'] = str(d)
        hold["name"] = row["subject"]["attribute"]["name"]
        hold["value"] = row["subject"]["attribute"]["value"]
        out.append(hold)
    return out
Example 53
Project: MultipleFactorRiskModel   Author: icezerowjj   File: own_tech.py    License: MIT License 5 votes vote down vote up
def clean_data(file_name, index_list):
    '''
    Read raw data from the file and compute/clean the factor data.

    :param string file_name: path to the xlsx data file
    :param [string] index_list: list of raw indicator (column) names to load
    :return: [{string:DataFrame}, DataFrame] [factor_data, ret]: one DataFrame
        per indicator, where element [i, j] is the value of stock j+1 on day
        i+1 for that indicator; ret is pruned with the same method so it can
        be used directly for regression.
    '''
    # data: all raw data loaded from file (dict of DataFrames keyed by indicator)
    data = load_file(file_name, index_list)
    # close: closing price, used as a factor and as input to most indicators
    close = data['close']
    # trade: trade volume/value as a factor
    trade = data['trade']
    # ret: per-period return derived from closing prices
    ret = getReturn(data['close'])
    # vol: volatility of returns as a factor
    vol = getVol(ret)
    # KDJ: stochastic oscillator; only the combined KDJ value is kept below
    [RSV, K, D, KDJ] = getKDJ(close, data['high'], data['low'])
    # EMA: exponential moving average of close
    EMA = getEMA(close)
    # buy_signal: binary buy indicator derived from EMA and volume
    buy_signal = getBuySignal(EMA, trade)
    # sell_signal: binary sell indicator derived from EMA and volume
    sell_signal = getSellSignal(EMA, trade)
    # RSI: relative strength index as a factor
    RSI = getRSI(close)
    # MTM: momentum value as a factor
    MTM = getMTM(close)
    ev = data['ev']
    # w: Williams %R indicator
    w = getWilliam(close, data['high'], data['low'])
    # Collect the computed indicators; pre_processing prunes them all to a
    # common length so the panels are aligned with close and ret.
    unpruned_factor_data = {'KDJ': KDJ, 'EMA': EMA, 'vol': vol, 'MTM': MTM, 'buy_signal': buy_signal,
                            'sell_signal': sell_signal, 'trade': trade, 'RSI': RSI, 'ev': ev, 'William': w}
    [close, data, ret] = pre_processing(close, unpruned_factor_data, ret)
    # Standardize every factor panel (column-wise scaling) in place.
    for i in data.items():
        data[i[0]]=pd.DataFrame(scale(i[1]))
    return [close, data, ret]
Example 54
Project: UMNN   Author: AWehenkel   File: gas.py    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def load_data_and_clean_and_split(file):
    """Load and clean the dataset, then split it into train/validate/test arrays.

    The last 10% of rows becomes the test set; of the remaining rows, the
    last 10% becomes the validation set and the rest the training set.
    """
    # DataFrame.as_matrix() was removed in pandas 0.25; .values is the
    # backward- and forward-compatible equivalent.
    data = load_data_and_clean(file).values
    N_test = int(0.1 * data.shape[0])
    data_test = data[-N_test:]
    data_train = data[0:-N_test]
    N_validate = int(0.1 * data_train.shape[0])
    data_validate = data_train[-N_validate:]
    data_train = data_train[0:-N_validate]

    return data_train, data_validate, data_test
Example 55
Project: c3nav   Author: c3nav   File: models.py    License: Apache License 2.0 5 votes vote down vote up
def clean_data(self):
        """Sanitize ``self.data``: keep only known fields whose value is a
        valid choice, substituting each field's initial value otherwise."""
        sanitized = OrderedDict()
        for name, field in self.get_fields().items():
            candidate = self.data.get(name)
            # Fall back to the field default for missing or out-of-range values.
            if candidate is None or candidate not in dict(field.choices):
                candidate = field.initial
            sanitized[name] = candidate
        self.data = sanitized
Example 56
Project: contiguous-succotash   Author: kefirski   File: batch_loader.py    License: MIT License 5 votes vote down vote up
def clean_whole_data(self, string):
        """Strip leading timestamp tokens, unwrap 11-space continuation lines,
        collapse double newlines, and lowercase the result."""
        string = re.sub(r'^[\d\:]+ ', '', string, flags=re.M)
        string = re.sub(r'\n\s{11}', ' ', string, flags=re.M)
        string = re.sub(r'\n{2}', '\n', string, flags=re.M)
        return string.lower()
Example 57
Project: CloudBot   Author: TotallyNotRobots   File: chan_track.py    License: GNU General Public License v3.0 5 votes vote down vote up
def clean_conn_data(conn):
    """Tidy the tracked state of every user and channel on this connection.

    :type conn: cloudbot.client.Client
    """
    # Users first, then channels — same order as before.
    for cleaner, mapping in ((clean_user_data, get_users(conn)),
                             (clean_chan_data, get_chans(conn))):
        for item in mapping.values():
            cleaner(item)
Example 58
Project: kaggle-heart   Author: 317070   File: visualize_preprocessing.py    License: MIT License 5 votes vote down vote up
def clean_image_data(imdata, metadata):
    """
    clean up 4d-tensor of imdata consistently (fix contrast, move upside up, etc...)
    :param imdata: list of image arrays; normalized in place and returned
    :param metadata: unused here (kept for interface compatibility)
    :return: the same list with each image contrast-stretched into [0, 1]
    """

    # normalize contrast using the global 5th/95th percentiles over all images
    flat_data = np.concatenate([i.flatten() for i in imdata]).flatten()
    high = np.percentile(flat_data, 95.0)
    low = np.percentile(flat_data, 5.0)
    print(high, low)  # was a Python 2 print statement; fixed for Python 3
    for i in range(len(imdata)):  # xrange -> range for Python 3
        image = imdata[i]
        image = 1.0 * (image - low) / (high - low)
        image = np.clip(image, 0.0, 1.0)
        imdata[i] = image

    return imdata
Example 59
Project: notifiers   Author: liiight   File: dynamic_click.py    License: MIT License 5 votes vote down vote up
def clean_data(data: dict) -> dict:
    """Drop unset (empty, non-boolean) values and convert tuples into lists."""
    cleaned = {}
    for key, value in data.items():
        # Booleans are always kept; everything else must be truthy to survive,
        # so only explicitly passed args get passed on.
        if not (isinstance(value, bool) or value):
            continue
        # Multiple-choice options arrive as tuples; the schema expects lists.
        cleaned[key] = list(value) if isinstance(value, tuple) else value
    return cleaned
Example 60
Project: QQZoneMood   Author: Maicius   File: QQZoneAnalysis.py    License: MIT License 5 votes vote down vote up
def clean_label_data():
    new_list = ['maicius']
    for name in new_list:
        print(name + '====================')
        analysis = QQZoneAnalysis(use_redis=True, debug=True, username=name, analysis_friend=False)
        # print(analysis.check_data_shape())
        analysis.get_useful_info_from_json()
        analysis.save_data_to_csv()
        # analysis.save_data_to_excel()
        # analysis.export_label_data(analysis.mood_data_df)
        # analysis.draw_content_cloud(analysis.mood_data_df)
        # analysis.draw_cmt_cloud(analysis.mood_data_df)
        analysis.draw_like_cloud(analysis.mood_data_df)
        # analysis.export_all_label_data()