Python pyspark.sql.functions.udf() Examples
The following are 30 code examples of pyspark.sql.functions.udf().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
Example #1
Source File: metrics.py From search-MjoLniR with MIT License | 7 votes |
def _ndcg_at(k, label_col):
    """Build a Spark UDF that computes NDCG for one group of rows.

    Parameters
    ----------
    k : int
        Not read by this function.
        NOTE(review): presumably the caller has already limited both
        collections to the top-k rows -- confirm against the call site.
    label_col : str
        Name of the row field holding the integer relevance label.

    Returns
    -------
    A UDF taking ``(predicted, actual)`` collections of rows, each row
    exposing ``rn`` and ``label_col``, and returning a double.
    """
    def _dcg(rows):
        # TODO: Taking in rn and then re-sorting might not be necessary, but
        # there is no real guarantee that rows come back in order after a
        # groupBy + collect_list, since they were only ordered within the
        # window function.
        labels = [row[label_col] for row in sorted(rows, key=lambda r: r.rn)]
        # This form is used to match EvalNDCG in xgboost
        return sum(
            (((1 << label) - 1) / math.log(i + 2.0, 2)
             for i, label in enumerate(labels)),
            0.0,
        )

    def ndcg_at_k(predicted, actual):
        idcg = _dcg(actual)
        if idcg == 0:
            # Must be a float: a DoubleType UDF that returns a Python int
            # produces null in the resulting column.
            return 0.0
        return _dcg(predicted) / idcg

    return F.udf(ndcg_at_k, pyspark.sql.types.DoubleType())
Example #2
Source File: named_image_test.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def test_featurizer_in_pipeline(self):
    """
    Check that DeepImageFeaturizer can be used as a stage of an MLlib Pipeline.

    Only the training rows are scored; generalization quality is not tested.
    """
    stages = [
        DeepImageFeaturizer(inputCol="image", outputCol="features", modelName=self.name),
        LogisticRegression(maxIter=20, regParam=0.05, elasticNetParam=0.3, labelCol="label"),
    ]
    pipeline = Pipeline(stages=stages)

    # add arbitrary labels to run logistic regression
    # TODO: it's weird that the test fails on some combinations of labels. check why.
    def _arbitrary_label(origin):
        return abs(hash(origin)) % 2

    label_udf = udf(_arbitrary_label, IntegerType())
    origin_col = self.imageDF["image"]["origin"]
    train_df = self.imageDF.withColumn("label", label_udf(origin_col))

    fitted = pipeline.fit(train_df)
    # see if we at least get the training examples right.
    # with 5 examples and e.g. 131k features (for InceptionV3), it ought to.
    for row in fitted.transform(train_df).collect():
        self.assertEqual(int(row.prediction), row.label)
Example #3
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results_single(self, scaffolds_df, sampled_df):
    """Join sampled decorations with their scaffolds on ``id``.

    Selects three columns: ``smiles`` (scaffold joined with all its
    decorations, or null when joining yields nothing), ``decorations``
    (map of attachment point -> cleaned decoration) and ``scaffold``.
    """
    def _join_scaffold(scaff, decs):
        mol = usc.join_joined_attachments(scaff, decs)
        # A falsy molecule means the join failed; the UDF then yields null.
        return usc.to_smiles(mol) if mol else None

    def _create_decorations_map(decorations_smi, attachment_points):
        pieces = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        mapping = {}
        for piece, point in zip(pieces, attachment_points):
            mapping[point] = _cleanup_decoration(piece)
        return mapping

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    create_decorations_map_udf = psf.udf(
        _create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    joined = sampled_df.join(scaffolds_df, on="id")
    return joined.select(
        join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
        create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
        "scaffold")
Example #4
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results_multi(self, scaffolds_df, sampled_df):
    """Join one sampled decoration per row onto its scaffold.

    The decoration is numbered with the first attachment point and
    canonicalized, joined onto ``smiles``, and recorded in the
    ``decorations`` map next to the entries already present.
    """
    def _join_scaffold(scaff, dec):
        mol = usc.join(scaff, dec)
        # A falsy molecule means the join failed; the UDF then yields null.
        return usc.to_smiles(mol) if mol else None

    def _format_attachment_point(smi, num):
        numbered = usc.add_first_attachment_point_number(smi, num)
        return usc.to_smiles(uc.to_mol(numbered))  # canonicalize

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())

    first_point = psf.col("attachment_points")[0]
    with_decoration = sampled_df.join(scaffolds_df, on="id").withColumn(
        "decoration", format_attachment_point_udf("decoration_smi", first_point))

    new_entry = psf.create_map(
        first_point, SampleScaffolds.cleanup_decoration_udf("decoration"))
    return with_decoration.select(
        join_scaffold_udf("smiles", "decoration").alias("smiles"),
        psf.map_concat(new_entry, "decorations").alias("decorations"),
        "scaffold")
Example #5
Source File: anomalies_detection_spark_streaming.py From Hanhan-Spark-Python with MIT License | 6 votes |
def detect(self, k, t):
    """Return the rows of ``self.rawDF`` whose anomaly score exceeds ``t``.

    Parameters
    ----------
    k : int
        Number of clusters for the streaming k-means model.
    t : float
        Score threshold; only rows with ``score > t`` are returned.
    """
    # Encoding categorical features using one-hot.
    df1 = self.cat2Num(self.rawDF, [0, 1]).cache()
    df1.show(n=2, truncate=False)

    # Clustering points using streaming KMeans. No training call is made in
    # this method, so predictions come from the randomly initialised centers.
    streaming_kmeans = StreamingKMeans(k=k, decayFactor=1.0).setRandomCenters(4, 1.0, 0)

    # Adding the prediction column to df1.
    # StreamingKMeans itself has no predict(); the model it wraps does.
    modelBC = sc.broadcast(streaming_kmeans.latestModel())
    predictUDF = udf(lambda x: modelBC.value.predict(x), StringType())
    df2 = df1.withColumn("prediction", predictUDF(df1.features)).cache()
    df2.show(n=3, truncate=False)

    # Adding the score column to df2; The higher the score, the more likely it is an anomaly
    df3 = self.addScore(df2).cache()
    df3.show(n=3, truncate=False)

    return df3.where(df3.score > t)
Example #6
Source File: anomalies_detection.py From Hanhan-Spark-Python with MIT License | 6 votes |
def detect(self, k, t):
    """Cluster the encoded rows with KMeans and keep those scoring above ``t``.

    ``k`` is the number of clusters; ``t`` is the score threshold.
    Each intermediate frame is cached and a few of its rows are printed.
    """
    # One-hot encode the categorical features.
    encoded = self.cat2Num(self.rawDF, [0, 1]).cache()
    encoded.show(n=2, truncate=False)

    # Train KMeans on the feature vectors.
    vectors = encoded.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(vectors, k, maxIterations=40, initializationMode="random", seed=20)

    # Attach the predicted cluster to every row.
    modelBC = sparkCt.broadcast(model)

    def _predict(x):
        return modelBC.value.predict(x)

    predictUDF = udf(_predict, StringType())
    predicted = encoded.withColumn("prediction", predictUDF(encoded.features)).cache()
    predicted.show(n=3, truncate=False)

    # Attach the score; the higher the score, the more likely it is an anomaly.
    scored = self.addScore(predicted).cache()
    scored.show(n=3, truncate=False)

    return scored.where(scored.score > t)
Example #7
Source File: anomalies_detection.py From Hanhan-Spark-Python with MIT License | 6 votes |
def detect(self, k, t):
    """Cluster the encoded rows with KMeans and keep those scoring above ``t``.

    ``k`` is the number of clusters; ``t`` is the score threshold.
    Each intermediate frame is cached and a few of its rows are printed.
    """
    # One-hot encode the categorical features.
    encoded = self.cat2Num(self.rawDF, [0, 1]).cache()
    encoded.show(n=2, truncate=False)

    # Train KMeans on the feature vectors.
    vectors = encoded.select("features").rdd.map(lambda row: row[0]).cache()
    model = KMeans.train(
        vectors, k, maxIterations=40, runs=10, initializationMode="random", seed=20)

    # Attach the predicted cluster to every row.
    modelBC = sc.broadcast(model)

    def _predict(x):
        return modelBC.value.predict(x)

    predictUDF = udf(_predict, StringType())
    predicted = encoded.withColumn("prediction", predictUDF(encoded.features)).cache()
    predicted.show(n=3, truncate=False)

    # Attach the score; the higher the score, the more likely it is an anomaly.
    scored = self.addScore(predicted).cache()
    scored.show(n=3, truncate=False)

    return scored.where(scored.score > t)
Example #8
Source File: udf.py From ibis with Apache License 2.0 | 6 votes |
def __call__(self, func):
    """Wrap ``func`` as an element-wise UDF over a Spark DataFrame.

    The input and output types are taken from ``self._input_type`` and
    ``self._output_type``:

    input_type : List[ibis.expr.datatypes.DataType]
        A list of the types found in :mod:`~ibis.expr.datatypes`. The
        length of this list must match the number of arguments to the
        function. Variadic arguments are not yet supported.
    output_type : ibis.expr.datatypes.DataType
        The return type of the function.

    Examples
    --------
    >>> import ibis
    >>> import ibis.expr.datatypes as dt
    >>> from ibis.spark.udf import udf
    >>> @udf.elementwise(input_type=[dt.string], output_type=dt.int64)
    ... def my_string_length(x):
    ...     return len(x) * 2
    """
    decorator = SparkUDF(self._input_type, self._output_type)
    return decorator(func)
Example #9
Source File: feature_engineering.py From search-MjoLniR with MIT License | 6 votes |
def explode_features(df, features=None):
    """Add one float column per entry of the ``features`` vector column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None
        Names for the new columns, in vector order. When None they are
        read from the metadata of the ``features`` column.

    Returns
    -------
    pyspark.sql.DataFrame
        All original columns plus the extracted ones.
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(vector, idx):
        return float(vector[idx])

    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())

    cols = []
    for idx, name in enumerate(features):
        cols.append(extract_feature_udf('features', F.lit(idx)).alias(name))
    return df.select('*', *cols)
Example #10
Source File: feature_engineering.py From search-MjoLniR with MIT License | 6 votes |
def zero_features(df, *feature_names):
    """Set the named features to zero inside the ``features`` vector column.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str
        Names looked up in the ``features`` column metadata.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = list(map(features.index, feature_names))

    def _zero_out(feat):
        raw = feat.toArray()
        for position in idxs:
            raw[position] = 0.
        return Vectors.dense(raw)

    zero_features_udf = F.udf(_zero_out, VectorUDT())
    # Re-attach the metadata, which is unchanged because no feature is removed.
    zeroed = mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})
    return df.withColumn('features', zeroed)
Example #11
Source File: tf_image.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _convertOutputToImage(self, df, tfs_output_col, output_shape):
    """Turn the numeric column ``tfs_output_col`` into an image struct column.

    ``output_shape`` must have four entries; entries 1 and 2 are used as the
    image height and width. The result is stored in the output column and
    ``tfs_output_col`` is dropped.
    """
    assert len(output_shape) == 4, str(output_shape) + " does not have 4 dimensions"
    height, width = int(output_shape[1]), int(output_shape[2])

    def to_image(orig_image, numeric_data):
        # Assume the returned image has float pixels but same #channels as input
        channels = orig_image.nChannels
        mode = imageIO.imageTypeByName('CV_32FC%d' % channels)
        pixels = np.array(numeric_data).astype(np.float32)
        return Row(
            origin="",
            mode=mode.ord,
            height=height,
            width=width,
            nChannels=channels,
            data=bytearray(pixels.tobytes()))

    to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
    converted = to_image_udf(df[self.getInputCol()], df[tfs_output_col])
    return df.withColumn(self.getOutputCol(), converted).drop(tfs_output_col)
Example #12
Source File: feature_engineering.py From search-MjoLniR with MIT License | 6 votes |
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str
        Names of the columns whose values are appended, in order, to the
        ``features`` vector. The names are also appended to the feature
        list stored in the column metadata.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))

    add_features_udf = F.udf(add_features, VectorUDT())
    # ``cols`` is a tuple (varargs); concatenating it to the metadata list
    # directly raises TypeError, so normalise both sides to lists.
    new_feat_list = list(df.schema['features'].metadata['features']) + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example #13
Source File: image_params.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def loadImagesInternal(self, dataframe, inputCol):
    """
    Load image files specified in dataset as image format specified in `sparkdl.image.imageIO`.

    :param dataframe: DataFrame holding the image URIs.
    :param inputCol: name of the column with the URIs.
    :return: ``dataframe`` with the loaded-image column added; rows whose
             image failed to load get null in that column.
    """
    # plan 1: udf(loader() + convert from np.array to imageSchema) -> call TFImageTransformer
    # plan 2: udf(loader()) ... we don't support np.array as a dataframe column type...
    loader = self.getImageLoader()

    # Load from external resources can fail, so we should allow None to be returned
    def load_image_uri_impl(uri):
        try:
            return imageArrayToStruct(_reverseChannels(loader(uri)))
        except Exception:  # pylint: disable=broad-except
            # Deliberately best-effort, but KeyboardInterrupt/SystemExit
            # must still propagate, hence Exception and not BaseException.
            return None

    load_udf = udf(load_image_uri_impl, ImageSchema.imageSchema['image'].dataType)
    return dataframe.withColumn(self._loadedImageCol(), load_udf(dataframe[inputCol]))
Example #14
Source File: named_image.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _decodeOutputAsPredictions(self, df):
    """Replace the intermediate output column with decoded top-K predictions.

    Each prediction is a struct of (class, description, probability).
    """
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    prediction_fields = [
        StructField("class", StringType(), False),
        StructField("description", StringType(), False),
        StructField("probability", FloatType(), False),
    ]
    decodedSchema = ArrayType(StructType(prediction_fields))

    def decode(predictions):
        batch = np.expand_dims(np.array(predictions), axis=0)
        top_entries = decode_predictions(batch, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(entry[0], entry[1], entry[2].item()) for entry in top_entries]

    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    with_predictions = df.withColumn(self.getOutputCol(), decodeUDF(df[interim_output]))
    return with_predictions.drop(interim_output)
Example #15
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def make_get_step_udf(multi_steps: Optional[int]):
    """Return a UDF giving the step count for an array column.

    The count is 1 when ``multi_steps`` is None, otherwise the array
    length capped at ``multi_steps``.
    """

    def get_step(col: List):
        if multi_steps is None:
            return 1
        return min(len(col), multi_steps)

    return udf(get_step, LongType())
Example #16
Source File: anomalies_detection_spark_streaming.py From Hanhan-Spark-Python with MIT License | 5 votes |
def addScore(self, df):
    """Add a ``score`` column derived from the size of each row's cluster.

    The score is ``(n_max - n_x) / (n_max - n_min)``, where ``n_x`` is the
    number of rows sharing the row's ``prediction`` and ``n_max``/``n_min``
    are the largest/smallest such counts. When every cluster has the same
    size the ratio is undefined and every row gets 0.0.

    :param df: DataFrame with a ``prediction`` column.
    :return: ``df`` with the ``score`` column (double) added.
    """
    from collections import Counter

    # Count rows per cluster using the first (only) field of each collected row.
    cluster_sizes = Counter(row[0] for row in df.select("prediction").collect())
    # An empty frame has no clusters; the UDF is then never invoked.
    n_max = float(max(cluster_sizes.values())) if cluster_sizes else 0.0
    n_min = float(min(cluster_sizes.values())) if cluster_sizes else 0.0
    spread = n_max - n_min
    sizes = dict(cluster_sizes)

    def score(p):
        if spread == 0:
            # All clusters are equally large: avoid dividing by zero.
            return 0.0
        return (n_max - sizes[p]) / spread

    score_udf = udf(score, DoubleType())
    score_df = df.withColumn("score", score_udf(df.prediction))
    return score_df
Example #17
Source File: base.py From LearningApacheSpark with MIT License | 5 votes |
def _transform(self, dataset):
    """Apply the transform function to the input column as a UDF.

    The schema is checked first; the result is written to the output column.
    """
    self.transformSchema(dataset.schema)
    transformUDF = udf(self.createTransformFunc(), self.outputDataType())
    output_name = self.getOutputCol()
    input_column = dataset[self.getInputCol()]
    return dataset.withColumn(output_name, transformUDF(input_column))
Example #18
Source File: functions.py From SMV with Apache License 2.0 | 5 votes |
def smvCreateLookUp(m, default, outputType):
    """Return a Python UDF which performs a dictionary lookup on a column

        Args:
            m (dictionary): a Python dictionary to be applied
            default (any): value returned when the key is not in ``m``
            outputType (DataType): output value's data type

        Returns:
            (udf): an udf which can be applied to a column to do the lookup
    """
    def _lookup(key):
        return m.get(key, default)

    return udf(_lookup, outputType)
Example #19
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 5 votes |
def get_addons_per_client(users_df, minimum_addons_count):
    """ Build a DataFrame with one row per client holding the list of
    its valid add-on GUIDs, ordered by install day.

    Only clients with more than `minimum_addons_count` valid add-ons
    are kept.
    """

    def is_valid_addon(addon):
        # may need additional whitelisting to remove shield addons
        return (
            not addon.is_system
            and not addon.app_disabled
            and addon.type == "extension"
            and not addon.user_disabled
            and not addon.foreign_install
            and addon.install_day is not None
        )

    def get_valid_addon_ids(addons):
        valid = [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)]
        valid.sort(key=lambda addon_tuple: addon_tuple[1])
        return [addon_id for addon_id, _ in valid]

    get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Create an add-ons dataset un-nesting the add-on map from each
    # user to a list of add-on GUIDs. Also filter undesired add-ons.
    per_client = users_df.select(
        "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids")
    )
    return per_client.filter(size("addon_ids") > minimum_addons_count)
Example #20
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def make_sparse2dense(df, col_name: str, possible_keys: List):
    """ Replace a sparse map column by a dense float array over `possible_keys`.

    A second column, `<col_name>_presence`, records for each key whether
    the map held a non-null value; missing values are densified as 0.0.
    """
    output_type = StructType(
        [
            StructField("presence", ArrayType(BooleanType()), False),
            StructField("dense", ArrayType(FloatType()), False),
        ]
    )

    def sparse2dense(map_col):
        assert isinstance(
            map_col, dict
        ), f"{map_col} has type {type(map_col)} and is not a dict."
        values = [map_col.get(key, None) for key in possible_keys]
        presence = [val is not None for val in values]
        dense = [float(val) if val is not None else 0.0 for val in values]
        return presence, dense

    sparse2dense_udf = udf(sparse2dense, output_type)
    # The struct temporarily replaces the column and is then unpacked.
    df = df.withColumn(col_name, sparse2dense_udf(col_name))
    df = df.withColumn(f"{col_name}_presence", col(f"{col_name}.presence"))
    df = df.withColumn(col_name, col(f"{col_name}.dense"))
    return df


#################################################
# Below are some UDFs we use for preprocessing. #
#################################################
Example #21
Source File: anomalies_detection_spark_streaming.py From Hanhan-Spark-Python with MIT License | 5 votes |
def cat2Num(self, df, indices):
    """Add a ``features`` column produced by ``to_onehot`` from ``rawFeatures``.

    The distinct values found at each position in ``indices`` are
    collected first and handed to ``to_onehot`` together with their count.
    """
    unique_values = []
    for i in indices:
        # The UDF is used within this same iteration.
        pick = udf(lambda r: r[i], StringType())
        unique_values += df.select(pick(df.rawFeatures)).distinct().collect()

    unique_count = len(unique_values)

    def _convert(r):
        return to_onehot(r, indices, unique_values, unique_count)

    convertUDF = udf(_convert, ArrayType(DoubleType()))
    return df.withColumn("features", convertUDF(df.rawFeatures))
Example #22
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def make_next_udf(multi_steps: Optional[int], return_type):
    """ Build a UDF of type `return_type` that picks the next item.

    With `multi_steps` None the column value is returned unchanged;
    otherwise the element at position min(len, multi_steps) - 1 is returned.
    """

    def get_next(next_col):
        if multi_steps is None:
            return next_col
        last_step = min(len(next_col), multi_steps)
        return next_col[last_step - 1]

    return udf(get_next, return_type)
Example #23
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def make_where_udf(arr: List[str]):
    """ Build a UDF mapping an item to its first index in arr, or len(arr) if absent. """

    def find(item: str):
        matches = (i for i, arr_item in enumerate(arr) if arr_item == item)
        return next(matches, len(arr))

    return udf(find, LongType())
Example #24
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def make_existence_bitvector_udf(arr: List[str]):
    """ Build a UDF returning, for each element of arr, 1 if it occurs in the
    target list and 0 otherwise. """
    template = [0] * len(arr)

    def encode(target: List[str]):
        bitvec = list(template)
        hits = (i for i, arr_item in enumerate(arr) if arr_item in target)
        for i in hits:
            bitvec[i] = 1
        return bitvec

    return udf(encode, ArrayType(LongType()))
Example #25
Source File: data_fetcher.py From ReAgent with BSD 3-Clause "New" or "Revised" License | 5 votes |
def parametric_action_preprocessing(
    df,
    actions: List[str],
    multi_steps: Optional[int] = None,
    include_possible_actions: bool = True,
):
    """ Preprocess the `action` / `next_action` map columns.

    Picks the next action after `multi_steps`, adds a `not_terminal`
    flag and densifies both action columns over `actions`.
    `include_possible_actions` must be falsy; anything else fails the assert.
    """
    assert (
        not include_possible_actions
    ), "current we don't support include_possible_actions"

    next_map_udf = make_next_udf(multi_steps, MapType(LongType(), FloatType()))
    df = df.withColumn("next_action", next_map_udf("next_action"))

    def get_not_terminal(next_action):
        # True iff next_action is a non-empty map.
        return len(next_action) > 0

    not_terminal_udf = udf(get_not_terminal, BooleanType())
    df = df.withColumn("not_terminal", not_terminal_udf("next_action"))

    for column in ("action", "next_action"):
        df = make_sparse2dense(df, column, actions)
    return df
Example #26
Source File: taar_ensemble.py From python_mozetl with MIT License | 5 votes |
def get_addons_per_client(users_df, minimum_addons_count):
    """ Build a DataFrame with one row per client holding the list of
    its valid add-on GUIDs, ordered by install day.

    Only clients with more than `minimum_addons_count` valid add-ons
    are kept.
    """

    def is_valid_addon(addon):
        # may need additional whitelisting to remove shield addons
        return (
            not addon.is_system
            and not addon.app_disabled
            and addon.type == "extension"
            and not addon.user_disabled
            and not addon.foreign_install
            and addon.install_day is not None
        )

    def get_valid_addon_ids(addons):
        valid = [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)]
        valid.sort(key=lambda addon_tuple: addon_tuple[1])
        return [addon_id for addon_id, _ in valid]

    get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Create an add-ons dataset un-nesting the add-on map from each
    # user to a list of add-on GUIDs. Also filter undesired add-ons.
    per_client = users_df.select(
        "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids")
    )
    return per_client.filter(size("addon_ids") > minimum_addons_count)
Example #27
Source File: test_imageIO.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def test_udf_schema(self):
    """A UDF built from the imageIO helpers can accept and return imageSchema."""

    def do_nothing(imgRow):
        # Round-trip struct -> array -> struct.
        decoded = imageIO.imageStructToArray(imgRow)
        return imageIO.imageArrayToStruct(decoded)

    do_nothing_udf = udf(do_nothing, ImageSchema.imageSchema['image'].dataType)

    df = imageIO._readImagesWithCustomFn(
        "file/path",
        decode_f=imageIO.PIL_decode,
        numPartition=2,
        sc=self.binaryFilesMock)
    non_null = df.filter(col('image').isNotNull())
    df = non_null.withColumn("test", do_nothing_udf('image'))

    # NOTE(review): `array` is not defined in this method; presumably a
    # module-level fixture -- confirm.
    self.assertEqual(df.first().test.data, array.tobytes())
    df.printSchema()
Example #28
Source File: TutorialClasses.py From KDD2019-HandsOn-Tutorial with MIT License | 5 votes |
def cosineSimilarity(s1, s2):
    """Cosine similarity of two Base64-encoded vectors.

    Each input is decoded by Base64ToFloatArray into a pair (m, v); the
    result is dot(v1, v2) / (m1 * m2), or 0 when either m is 0.
    NOTE(review): m is presumably the vector magnitude -- confirm.
    """
    m1, v1 = Base64ToFloatArray(s1)
    m2, v2 = Base64ToFloatArray(s2)
    if m1 == 0 or m2 == 0:
        return 0
    dot = sum(x * y for x, y in zip(v1, v2))
    return dot / (m1 * m2)


# Register udf functions so that it could be used in dataframe
#
# Perform same computation as cosineSimilarity()
#
Example #29
Source File: hostlinks_to_graph.py From cc-pyspark with MIT License | 5 votes |
def vertices_assign_ids(self, sc, sqlc, edges):
    """Assign a numeric id to every distinct vertex name in ``edges``.

    Names come from the ``s`` and ``t`` columns. The resulting
    (name, id) frame is saved as table ``<output>_vertices`` and, when
    requested, also written as gzip-compressed text. Returns the frame.
    """
    sources = edges.select(edges.s.alias('name'))
    targets = edges.select(edges.t.alias('name'))
    ids = sources.union(targets).distinct()

    if self.args.validate_host_names:
        is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid,
                            BooleanType())
        ids = ids.filter(is_valid(ids.name))

    if self.args.vertex_partitions == 1:
        single = ids.coalesce(1).sort('name')
        ids = single.withColumn('id', sqlf.monotonically_increasing_id())
    else:
        # Sort the names across partitions and number them with zipWithIndex.
        names = ids.select(ids.name).rdd.map(lambda row: tuple(row)[0])
        id_rdd = names \
            .sortBy(lambda x: x, True, self.args.vertex_partitions) \
            .zipWithIndex()
        id_schema = StructType([
            StructField("name", StringType(), True),
            StructField("id", LongType(), True)
        ])
        ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

    if self.args.save_as_text is not None:
        # Persisted because the frame is written twice from here on.
        ids = ids.persist()
        text_path = os.path.join(self.args.save_as_text, "vertices")
        ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
            .write \
            .text(text_path, compression="gzip")

    writer = ids.write \
        .format(self.args.output_format) \
        .option("compression", self.args.output_compression)
    writer.saveAsTable(self.args.output + '_vertices')

    return ids
Example #30
Source File: converter.py From spark-sklearn with Apache License 2.0 | 5 votes |
def toPandas(self, df):
    """
    Convert a Spark DataFrame to Pandas, like the built-in toPandas() method,
    except that MLlib Vector columns become rows of scipy.sparse.csr_matrix,
    which is generally friendlier for PyData tools like scikit-learn.

    .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

    :param df: Spark DataFrame
    :return: Pandas dataframe
    """
    def toscipy(v):
        if isinstance(v, DenseVector):
            indices = np.array(range(v.size))
            indptr = np.array([0, v.size])
            return csr_matrix((v.values, indices, indptr), shape=(1, v.size))
        if isinstance(v, SparseVector):
            indptr = np.array([0, len(v.indices)])
            return csr_matrix((v.values, v.indices, indptr), shape=(1, v.size))
        raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v))

    tosparse = udf(lambda v: toscipy(v), CSRVectorUDT())

    # Convert any MLlib Vector columns to scipy.sparse.csr_matrix
    selected = []
    for field, name in zip(df.schema.fields, df.columns):
        if isinstance(field.dataType, VectorUDT):
            selected.append(tosparse(df[name]).alias(name))
        else:
            selected.append(df[name])
    return df.select(*selected).toPandas()