Python config.LOG_DIR Examples

The following are 30 code examples that use config.LOG_DIR. The original project and source file for each example are noted above its code. You may also want to check out the other functions and classes of the config module.
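All of the examples below assume a project-level config module that exposes LOG_DIR together with other path constants. That module is not shown on this page; the following is a minimal hypothetical sketch (the directory names are illustrative, not taken from any of the projects below):

import os

# config.py -- hypothetical minimal sketch; the real projects define many more constants
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

LOG_DIR = os.path.join(ROOT_DIR, "logs")       # where run logs are written
FEAT_DIR = os.path.join(ROOT_DIR, "features")  # where generated features are saved
OUTPUT_DIR = os.path.join(ROOT_DIR, "output")  # model output / submissions

# Ensure the directories exist before any logger or feature writer touches them.
for d in (LOG_DIR, FEAT_DIR, OUTPUT_DIR):
    os.makedirs(d, exist_ok=True)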
Example #1
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go() 
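Example #1 (and most of the kaggle-HomeDepot examples below) relies on two small helpers, time_utils._timestamp() and logging_utils._get_logger(log_dir, logname), whose implementations are not shown on this page. A plausible minimal sketch, assuming the log format hinted at in Example #3's commented-out code:

import datetime
import logging
import os

def _timestamp():
    # e.g. "2016_04_25_13_05_12"; the exact format used by the project may differ
    return datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

def _get_logger(log_dir, logname):
    # One log file per run under log_dir, plus echo to the console.
    logger = logging.getLogger(logname)
    logger.setLevel(logging.INFO)
    fmt = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s")
    fh = logging.FileHandler(os.path.join(log_dir, logname))
    fh.setFormatter(fmt)
    logger.addHandler(fh)
    logger.addHandler(logging.StreamHandler())
    return logger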
Example #2
Source File: test.py    From PINTO_model_zoo with MIT License
def __init__(self, test_weight):
        log_dir = os.path.join(cfg.LOG_DIR, 'test')
        test_weight_path = os.path.join(cfg.WEIGHTS_DIR, test_weight)

        with tf.name_scope('input'):
            input_data = tf.placeholder(dtype=tf.float32, name='input_data')
            training = tf.placeholder(dtype=tf.bool, name='training')
        _, _, _, pred_sbbox, pred_mbbox, pred_lbbox = YOLOV3(training).build_nework(input_data)
        with tf.name_scope('summary'):
            tf.summary.FileWriter(log_dir).add_graph(tf.get_default_graph())
        self.__sess = tf.Session()
        net_vars = tf.get_collection('YoloV3')
        saver = tf.train.Saver(net_vars)
        saver.restore(self.__sess, test_weight_path)
        super(Yolo_test, self).__init__(self.__sess, input_data, training, pred_sbbox, pred_mbbox, pred_lbbox)
        print("input_data.name=", input_data.name)
        print("pred_sbbox=", pred_sbbox.name)
        print("pred_mbbox=", pred_mbbox.name)
        print("pred_lbbox=", pred_lbbox.name) 
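The LOG_DIR usage here is tf.summary.FileWriter(log_dir).add_graph(...), which dumps the graph under cfg.LOG_DIR/test so it can be inspected with TensorBoard. A self-contained sketch of the same pattern, written against the TF1-style API via tf.compat.v1 (LOG_DIR here is a stand-in, not the project's cfg.LOG_DIR):

import os
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

LOG_DIR = "./logs"  # stand-in for cfg.LOG_DIR
log_dir = os.path.join(LOG_DIR, "test")

x = tf.compat.v1.placeholder(tf.float32, name="input_data")
y = tf.identity(x, name="output")

# Write the default graph; view it with: tensorboard --logdir ./logs
writer = tf.compat.v1.summary.FileWriter(log_dir)
writer.add_graph(tf.compat.v1.get_default_graph())
writer.close()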
Example #3
Source File: eval.py    From NFETC with MIT License
def main(options):
    if options.epoch:
        time_str = datetime.datetime.now().isoformat()
        logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (options.model_name,
                options.data_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
    else:
        time_str = datetime.datetime.now().isoformat()
        logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (options.model_name,
                options.data_name, time_str)
        logger = logging_utils._get_logger(config.LOG_DIR, logname)
        # logger = logging.getLogger()
        # logging.basicConfig(format='[%(asctime)s] %(levelname)s: %(message)s', level=logging.INFO)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.data_name, options.runs, params_dict, logger)
    if options.save:
        task.save()
    else:
        if options.epoch:
            task.refit()
        else:
            task.evaluate(options.full) 
Example #4
Source File: feature_distance.py    From kaggle-HomeDepot with MIT License
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(CompressionDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go() 
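PairwiseFeatureWrapper is project code and not reproduced on this page (note that ngrams, aggregation_mode_prev and aggregation_mode in Example #4 are module-level constants in the source file). Conceptually the wrapper applies a generator to every (obs_field, target_field) pair and persists the result under FEAT_DIR; a rough hypothetical skeleton:

class PairwiseFeatureWrapper:
    # Hypothetical skeleton; the real class also handles feature file naming,
    # correlation logging and missing columns.
    def __init__(self, generator, dfAll, obs_fields, target_fields,
                 param_list, feat_dir, logger):
        self.generator = generator
        self.dfAll = dfAll
        self.obs_fields = obs_fields
        self.target_fields = target_fields
        self.param_list = param_list
        self.feat_dir = feat_dir
        self.logger = logger

    def go(self):
        for obs_field in self.obs_fields:
            for target_field in self.target_fields:
                ext = self.generator(self.dfAll[obs_field].values,
                                     self.dfAll[target_field].values,
                                     *self.param_list)
                x = ext.transform()
                self.logger.info("generated %s x %s" % (obs_field, target_field))
                # ... save x under self.feat_dir ...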
Example #5
Source File: feature_distance.py    From kaggle-HomeDepot with MIT License
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go() 
Example #6
Source File: feature_combiner.py    From kaggle-HomeDepot with MIT License
def __init__(self, feature_list, feature_name, feature_suffix=".csv",
                feature_level=2, meta_feature_dict={}, corr_threshold=0):
        self.feature_name = feature_name
        self.feature_list = feature_list
        self.feature_suffix = feature_suffix
        self.feature_level = feature_level
        # for meta features
        self.meta_feature_dict = meta_feature_dict
        self.corr_threshold = corr_threshold
        self.feature_names_basic = []
        self.feature_names_cv = []
        self.feature_names = []
        self.has_basic = 1 if self.meta_feature_dict else 0
        logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        if self.feature_level == 2:
            self.splitter = splitter_level2
        elif self.feature_level == 3:
            self.splitter = splitter_level3
        self.n_iter = n_iter
        self.splitter_prev = [0]*self.n_iter 
Example #7
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_lsa_ngram_cosinesim():
    logname = "generate_feature_lsa_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_CosineSim, LSA_Char_Ngram_CosineSim]
    ngrams_list = [[1,2,3], [2,3,4,5]]  # full grid (unused); overridden by the reduced grid below
    ngrams_list = [[3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator,ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go() 
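LSA_Word_Ngram_CosineSim and LSA_Char_Ngram_CosineSim are project generators. The underlying idea (TF-IDF, then TruncatedSVD for the latent space, then cosine similarity between the projected query and document) can be sketched with scikit-learn; this is an approximation of the technique, not the project's implementation, and the ngram range is an assumption:

import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def lsa_cosine_sim(obs_corpus, target_corpus, ngram=1, svd_dim=100, svd_n_iter=5):
    # Fit TF-IDF + SVD on both corpora jointly, then score each
    # (obs, target) pair by cosine similarity in the latent space.
    tfidf = TfidfVectorizer(ngram_range=(ngram, ngram))
    X = tfidf.fit_transform(list(obs_corpus) + list(target_corpus))
    svd = TruncatedSVD(n_components=svd_dim, n_iter=svd_n_iter, random_state=2016)
    Z = svd.fit_transform(X)
    n = len(obs_corpus)
    A, B = Z[:n], Z[n:]
    num = (A * B).sum(axis=1)
    den = np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1) + 1e-12
    return num / den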
Example #8
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_lsa_ngram_pair():
    logname = "generate_feature_lsa_ngram_pair_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


# memory error (use feature_tsne.R instead) 
Example #9
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"][:1] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go() 
Example #10
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_lsa_ngram():
    logname = "generate_feature_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram, LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]  # full grid (unused); overridden by the reduced grid below
    ngrams_list = [[3], [4]]
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go() 
Example #11
Source File: extreme_ensemble_selection.py    From kaggle-HomeDepot with MIT License
def main(options):

    # create sub folder
    subm_folder = "%s/ensemble_selection"%config.SUBM_DIR
    os_utils._create_dirs( [subm_folder] )
    subm_prefix = "%s/test.pred.[%s]" % (subm_folder, options.outfile)

    # get model list
    log_folder = "%s/level%d_models"%(config.LOG_DIR, options.level-1)
    model_list = get_model_list(log_folder, options.size)

    # get instance splitter
    if options.level not in [2, 3]:
        inst_splitter = None
    elif options.level == 2:
        inst_splitter = splitter_level2
    elif options.level == 3:
        inst_splitter = splitter_level3

    ees = ExtremeEnsembleSelection(
            model_folder=config.OUTPUT_DIR, 
            model_list=model_list, 
            subm_prefix=subm_prefix, 
            weight_opt_max_evals=options.weight_opt_max_evals, 
            w_min=-1., 
            w_max=1., 
            inst_subsample=options.inst_subsample,
            inst_subsample_replacement=options.inst_subsample_replacement,
            inst_splitter=inst_splitter,
            model_subsample=options.model_subsample,
            model_subsample_replacement=options.model_subsample_replacement,
            bagging_size=options.bagging_size, 
            init_top_k=options.init_top_k,
            epsilon=options.epsilon,
            multiprocessing=False, 
            multiprocessing_num_cores=config.NUM_CORES,
            enable_extreme=options.enable_extreme,
            random_seed=config.RANDOM_SEED
        )
    ees.go() 
Example #12
Source File: feature_wordnet_similarity.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    # WordNet_Lch_Similarity and WordNet_Wup_Similarity are not used in final submission
    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ][:1]
    obs_fields_list = []
    target_fields_list = []
    # only search_term and product_title are used in final submission
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"][:1] )
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go() 
Example #13
Source File: feature_group_relevance.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go() 
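The comment above says to run splitter.py first; that script pickles the cross-validation splits consumed here. A hypothetical sketch of what such a splitter might produce, i.e. a list of (train_index, valid_index) pairs (TRAIN_SIZE and SPLIT_DIR are stand-ins):

import pickle

import numpy as np
from sklearn.model_selection import KFold

TRAIN_SIZE = 74067   # stand-in; the project derives this from the data
SPLIT_DIR = "./splits"

kf = KFold(n_splits=5, shuffle=True, random_state=2016)
split = list(kf.split(np.arange(TRAIN_SIZE)))  # [(trainInd, validInd), ...]
with open("%s/splits_level1.pkl" % SPLIT_DIR, "wb") as f:
    pickle.dump(split, f)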
Example #14
Source File: task.py    From NFETC with MIT License
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "[Model@%s]_[Data@%s]_%s.log" % (options.model_name, options.data_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.data_name, options.cv_runs, options.max_evals, logger)
    optimizer.run() 
Example #15
Source File: get_stacking_feature_conf.py    From kaggle-HomeDepot with MIT License
def _create_feature_conf(level, topN, outfile):
    log_folder = "%s/level%d_models"%(config.LOG_DIR, level)
    feature_list = get_model_list(log_folder, topN)
    res = header_pattern%(__file__, level, int(topN), outfile)
    for feature in feature_list:
        res += '"%s",\n'%feature
    res += "]\n"
    with open(os.path.join(config.FEAT_CONF_DIR, outfile), "w") as f:
        f.write(res) 
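get_model_list is not shown on this page. From its call sites (here and in Example #11) it scans a level-N model log folder and returns the topN model names, presumably ranked by the cross-validation score recorded in each log. A heavily hedged sketch; the real parsing rules and score format will differ:

import os
import re

def get_model_list(log_folder, topN):
    # Rank model log files by the best (lowest) RMSE mentioned inside each,
    # then return the names of the topN models.
    pattern = re.compile(r"RMSE\D*([0-9]*\.[0-9]+)")
    scores = {}
    for fname in os.listdir(log_folder):
        with open(os.path.join(log_folder, fname)) as f:
            found = [float(m.group(1)) for m in map(pattern.search, f) if m]
        if found:
            scores[os.path.splitext(fname)[0]] = min(found)
    return sorted(scores, key=scores.get)[:int(topN)]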
Example #16
Source File: task.py    From HRERE with MIT License
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "[Model@%s]_%s.log" % (options.model_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.model_name, options.max_evals, options.runs, logger)
    optimizer.run() 
Example #17
Source File: eval.py    From HRERE with MIT License
def main(options):
    time_str = datetime.datetime.now().isoformat()
    logname = "Final_[Model@%s]_%s.log" % (options.model_name, time_str)
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    params_dict = param_space_dict[options.model_name]
    task = Task(options.model_name, options.runs, params_dict, logger)
    task.evaluate(options.prefix) 
Example #18
Source File: plot_feature_corr.py    From kaggle-HomeDepot with MIT License
def grap_feat_corr_dict(fname):
    d = {}
    with open("%s/feature/%s"%(config.LOG_DIR, fname), "r") as f:
        for line in f:
            corr = grap_feat_line_corr(line)
            if corr is not None:
                name = grap_feat_line_name(line)
                d[name] = corr
    return d.values() 
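grap_feat_line_corr and grap_feat_line_name are helpers in the same file. Assuming log lines of the shape produced in Example #28 ("%s (%dD): corr = %.6f"), plausible implementations would be:

import re

# Hypothetical; matches lines like "FeatureName_1D (1D): corr = 0.123456"
_corr_pattern = re.compile(r"(\S+) \((\d+)D\): corr = (-?[0-9.]+)")

def grap_feat_line_corr(line):
    m = _corr_pattern.search(line)
    return float(m.group(3)) if m else None

def grap_feat_line_name(line):
    m = _corr_pattern.search(line)
    return m.group(1) if m else None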
Example #19
Source File: feature_match.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_match_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    
    generators = [
        MatchQueryCount, 
        MatchQueryRatio, 
        LongestMatchSize, 
        LongestMatchRatio, 
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()

    # product_attribute_list
    generators = [
        MatchAttrCount, 
        MatchAttrRatio, 
        IsIndoorOutdoorMatch, 
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_attribute_list"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go() 
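LongestMatchSize and LongestMatchRatio are project generators; the core computation can be approximated with the standard library's difflib (the exact ratio denominator used by the project is an assumption here):

from difflib import SequenceMatcher

def longest_match_size(obs, target):
    # Length of the longest contiguous block common to both strings.
    match = SequenceMatcher(None, obs, target).find_longest_match(
        0, len(obs), 0, len(target))
    return match.size

def longest_match_ratio(obs, target):
    return longest_match_size(obs, target) / float(min(len(obs), len(target)) or 1)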
Example #20
Source File: feature_first_last_ngram.py    From kaggle-HomeDepot with MIT License
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram, 
        LastIntersectCount_Ngram, 
        FirstIntersectRatio_Ngram, 
        LastIntersectRatio_Ngram, 
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go() 
Example #21
Source File: task.py    From kaggle-HomeDepot with MIT License
def main(options):
    logname = "[Feat@%s]_[Learner@%s]_hyperopt_%s.log"%(
        options.feature_name, options.learner_name, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    optimizer = TaskOptimizer(options.task_mode, options.learner_name, 
        options.feature_name, logger, options.max_evals, verbose=True, 
        refit_once=options.refit_once, plot_importance=options.plot_importance)
    optimizer.run() 
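TaskOptimizer is project code; the "hyperopt" in the log name indicates hyperparameter search with the Hyperopt library. For reference, a minimal self-contained Hyperopt loop (the search space and objective here are illustrative, not the project's):

from hyperopt import Trials, fmin, hp, tpe

space = {
    "learning_rate": hp.loguniform("learning_rate", -5, 0),
    "max_depth": hp.quniform("max_depth", 3, 10, 1),
}

def objective(params):
    # Stand-in for "fit learner on the feature set, return validation RMSE".
    return (params["learning_rate"] - 0.1) ** 2 + params["max_depth"] * 1e-3

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=10, trials=trials)
print(best)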
Example #22
Source File: feature_group_distance.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go() 
Example #23
Source File: feature_combiner.py    From kaggle-HomeDepot with MIT License
def __init__(self, feature_dict, feature_name, feature_suffix=".pkl", corr_threshold=0):
        self.feature_name = feature_name
        self.feature_dict = feature_dict
        self.feature_suffix = feature_suffix
        self.corr_threshold = corr_threshold
        self.feature_names_basic = []
        self.feature_names_cv = []
        self.feature_names = []
        self.basic_only = 0
        logname = "feature_combiner_%s_%s.log"%(feature_name, time_utils._timestamp())
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.splitter = splitter_level1
        self.n_iter = n_iter 
Example #24
Source File: extreme_ensemble_selection.py    From kaggle-HomeDepot with MIT License
def __init__(self, model_folder, model_list, subm_prefix, 
                weight_opt_max_evals=10, w_min=-1., w_max=1., 
                inst_subsample=0.5, inst_subsample_replacement=False, 
                inst_splitter=None,
                model_subsample=1.0, model_subsample_replacement=True,
                bagging_size=10, init_top_k=5, epsilon=0.00001, 
                multiprocessing=False, multiprocessing_num_cores=1,
                enable_extreme=True, random_seed=0):

        self.model_folder = model_folder
        self.model_list = model_list
        self.subm_prefix = subm_prefix
        self.weight_opt_max_evals = weight_opt_max_evals
        self.w_min = w_min
        self.w_max = w_max
        assert inst_subsample > 0 and inst_subsample <= 1.
        self.inst_subsample = inst_subsample
        self.inst_subsample_replacement = inst_subsample_replacement
        self.inst_splitter = inst_splitter
        assert model_subsample > 0
        assert (type(model_subsample) == int) or (model_subsample <= 1.)
        self.model_subsample = model_subsample
        self.model_subsample_replacement = model_subsample_replacement
        self.bagging_size = bagging_size
        self.init_top_k = init_top_k
        self.epsilon = epsilon
        self.multiprocessing = multiprocessing
        self.multiprocessing_num_cores = multiprocessing_num_cores
        self.enable_extreme = enable_extreme
        self.random_seed = random_seed
        logname = "ensemble_selection_%s.log"%time_utils._timestamp()
        self.logger = logging_utils._get_logger(config.LOG_DIR, logname)
        self.n_models = len(self.model_list) 
Example #25
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    ngrams_list = [[1,2,3], [2,3,4,5]]  # full grid (unused); overridden by the reduced grid below
    ngrams_list = [[3], [4]]
    # obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    obs_fields = ["search_term", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go() 
Example #26
Source File: feature_intersect_count.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_intersect_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectCount_Ngram, 
        IntersectRatio_Ngram, 
        CooccurrenceCount_Ngram, 
        CooccurrenceRatio_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go() 
Example #27
Source File: feature_stat_cooc_tfidf.py    From kaggle-HomeDepot with MIT License
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    if which == "tf":
        generators.append( StatCoocTF_Ngram )
    elif which == "norm_tf":
        generators.append( StatCoocNormTF_Ngram )
    elif which == "tfidf":
        generators.append( StatCoocTFIDF_Ngram )
    elif which == "norm_tfidf":
        generators.append( StatCoocNormTFIDF_Ngram )
    elif which == "bm25":
        generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go() 
Example #28
Source File: feature_query_quality.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after lemmatization
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = np_utils._dim(x)
            fname = "%s_%s_x_%s_%dD"%(ext.__name__(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go() 
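np_utils._corr presumably computes the Pearson correlation between a feature column and the training relevance; a plausible sketch:

import numpy as np

def _corr(x, y):
    # Hypothetical: Pearson correlation, guarding against zero variance.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.std() == 0 or y.std() == 0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])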
Example #29
Source File: feature_basic.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description", 
                "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description", 
    "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount, 
        AttrBulletCount, 
        AttrBulletRatio, 
        AttrNonBulletCount, 
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go() 
Example #30
Source File: feature_doc2vec.py    From kaggle-HomeDepot with MIT License
def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        doc2vec_model_sent_label = None  # word2vec-format models carry no sentence labels
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            elif ".txt" in doc2vec_model_dir:
                # elif, so a ".bin" model is not reloaded (and clobbered) by the branch below
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim, 
            Doc2Vec_RMSE, 
            # Doc2Vec_Vdiff, 
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"][:1] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
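The Doc2Vec model loaded above is trained elsewhere in the project. For orientation, a minimal gensim (>= 4.0 API) training-and-saving sketch; the corpus and dimensions are illustrative only:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Illustrative corpus; the project trains on search/title/description text.
corpus = [
    TaggedDocument(words=["angle", "bracket"], tags=["doc_0"]),
    TaggedDocument(words=["metal", "l", "bracket"], tags=["doc_1"]),
]

model = Doc2Vec(corpus, vector_size=100, min_count=1, epochs=20)
model.save("Homedepot-doc2vec-D100-min_count1.model")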