Python pyspark.sql.functions.rand() Examples

The following are 6 code examples of pyspark.sql.functions.rand(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.functions , or try the search function .
Example #1
Source File: taar_ensemble.py    From telemetry-airflow with Mozilla Public License 2.0 6 votes vote down vote up
def cross_validation_split(dataset, k_folds):
    """
  Splits dataframe into k_folds, returning array of dataframes
  """
    dataset_split = []
    h = 1.0 / k_folds
    df = dataset.select("*", rand().alias("rand"))

    for i in range(k_folds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df["rand"] >= validateLB) & (df["rand"] < validateUB)
        fold = df.filter(condition).cache()
        dataset_split.append(fold)

    return dataset_split 
Example #2
Source File: taar_ensemble.py    From python_mozetl with MIT License 6 votes vote down vote up
def cross_validation_split(dataset, k_folds):
    """
  Splits dataframe into k_folds, returning array of dataframes
  """
    dataset_split = []
    h = 1.0 / k_folds
    df = dataset.select("*", rand().alias("rand"))

    for i in range(k_folds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df["rand"] >= validateLB) & (df["rand"] < validateUB)
        fold = df.filter(condition).cache()
        dataset_split.append(fold)

    return dataset_split 
Example #3
Source File: tests.py    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _transform(self, dataset):
        return dataset.withColumn("prediction",
                                  dataset.feature + (rand(0) * self.getInducedError())) 
Example #4
Source File: tuning.py    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)] for i in range(nFolds)]

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

            tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
            for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
                metrics[j] += (metric / nFolds)
                if collectSubModelsParam:
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels)) 
Example #5
Source File: tuning.py    From LearningApacheSpark with MIT License 5 votes vote down vote up
def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        condition = (df[randCol] >= tRatio)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [None for i in range(numModels)]

        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        metrics = [None] * numModels
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] = metric
            if collectSubModelsParam:
                subModels[j] = subModel

        train.unpersist()
        validation.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel, metrics, subModels)) 
Example #6
Source File: utils.py    From dist-keras with GNU General Public License v3.0 5 votes vote down vote up
def shuffle(dataset):
    """Shuffles the rows in the specified Spark Dataframe.

    # Arguments
        dataset: dataframe. A Spark Dataframe.
    """
    dataset = dataset.orderBy(rand())
    dataset.cache()

    return dataset