Python pyspark.sql.functions.explode() Examples

The following are 13 code examples of pyspark.sql.functions.explode(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the link above each example. You may also want to check out all available functions and classes of the pyspark.sql.functions module, or try the search function.
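Before the project examples, here is a minimal, hypothetical sketch of what explode() does: each element of an array column becomes its own row. The column names below are illustrative and do not come from any of the projects that follow.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("q1", [10, 11]), ("q2", [12])],
    ["query", "hit_page_ids"])

# explode() turns the two-element array for "q1" into two rows.
df.select("query", F.explode("hit_page_ids").alias("page_id")).show()
# +-----+-------+
# |query|page_id|
# +-----+-------+
# |   q1|     10|
# |   q1|     11|
# |   q2|     12|
# +-----+-------+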
Example #1
Source File: feature_vectors.py    From search-MjoLniR with MIT License (6 votes)
def resample_clicks_to_query_page(
    df_cluster: DataFrame,
    random_seed: Optional[int],
    samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).

    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col('cluster_id', 'norm_query_id', lambda df: mjolnir.sampling.sample(
            df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ]) 
Example #2
Source File: swissModelDataset.py    From mmtf-pyspark with Apache License 2.0 (6 votes)
def _flatten_dataset(ds):
    '''Flattens the original hierarchical data schema into a simple row-based
    schema. Some less useful data are excluded.

    Parameters
    ----------
    ds : dataset
       the original spark dataset

    Returns
    -------
    dataset
       flattened dataset
    '''

    ds = ds.withColumn("structures", explode(ds.result.structures))
    return ds.select(col("query.ac"), col("result.sequence"), \
                     col("structures.from"), col("structures.to"), \
                     col("structures.qmean"), col("structures.qmean_norm"), \
                     col("structures.gmqe"), col("structures.coverage"), \
                     col("structures.oligo-state"), col("structures.method"), \
                     col("structures.template"), col("structures.identity"), \
                     col("structures.similarity"), col("structures.coordinates"),\
                     col("result.md5"), col("structures.md5")) 
Example #3
Source File: dataset_utils.py    From mmtf-pyspark with Apache License 2.0 (6 votes)
def flatten_dataset(dataset: DataFrame):
    tmp = dataset
    for field in tmp.schema.fields:
        if isinstance(field.dataType, ArrayType):
            print(field.name, field.dataType)
            tmp = tmp.withColumn(field.name, explode(tmp[field.name]))

    return tmp 
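A hypothetical usage sketch for flatten_dataset() above, assuming a local SparkSession and a toy DataFrame with one ArrayType column; the imports mirror what the snippet relies on.

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import explode
from pyspark.sql.types import ArrayType

spark = SparkSession.builder.getOrCreate()
ds = spark.createDataFrame(
    [("1ABC", ["A", "B"]), ("2XYZ", ["A"])],
    ["structureId", "chainIds"])

# chainIds is the only ArrayType column, so it is the only one exploded:
# ("1ABC", "A"), ("1ABC", "B"), ("2XYZ", "A")
flatten_dataset(ds).show()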
Example #4
Source File: norm_query_clustering.py    From search-MjoLniR with MIT License (5 votes)
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id')) 
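A hypothetical usage sketch for with_unique_cluster_id(), assuming F is pyspark.sql.functions and a toy DataFrame with the four columns the function expects.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("enwiki", "foo", 0, "foo"), ("enwiki", "foo", 0, "foos")],
    ["wikiid", "norm_query", "norm_query_group_id", "query"])

# Both queries belong to the same group, so after the explode they share
# one cluster_id generated by monotonically_increasing_id().
with_unique_cluster_id(df).show()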
Example #5
Source File: advancedSearchDataset.py    From mmtf-pyspark with Apache License 2.0 (5 votes)
def __get_entity_to_chain_id():
    # get entityID to strandId mapping
    query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    mapping: DataFrame = pdbjMineDataset.get_dataset(query)

    # split one-to-many relationship into multiple records: 'A,B' -> ['A', 'B'] -> explode to separate rows
    mapping = mapping.withColumn("chainId", split(mapping.pdbx_strand_id, ","))
    mapping = mapping.withColumn("chainId", explode("chainId"))

    # create a structureChainId column, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("pdbChainId", concat_ws(".", mapping.structureId, mapping.chainId))

    return mapping.select(mapping.entity_id, mapping.structureId, mapping.pdbChainId) 
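The split -> explode -> concat_ws sequence in __get_entity_to_chain_id() is easy to reproduce on a toy DataFrame; this hypothetical sketch uses a made-up structure ID and skips the pdbjMineDataset query.

from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, explode, split

spark = SparkSession.builder.getOrCreate()
mapping = spark.createDataFrame(
    [("1XYZ", "1", "A,B")],
    ["structureId", "entity_id", "pdbx_strand_id"])

# 'A,B' -> ['A', 'B'] -> one row per chain -> '1XYZ.A' and '1XYZ.B'
mapping = mapping.withColumn("chainId", split(mapping.pdbx_strand_id, ","))
mapping = mapping.withColumn("chainId", explode("chainId"))
mapping = mapping.withColumn("pdbChainId", concat_ws(".", mapping.structureId, mapping.chainId))
mapping.select("entity_id", "structureId", "pdbChainId").show()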
Example #6
Source File: g2sDataset.py    From mmtf-pyspark with Apache License 2.0 (5 votes)
def _flatten_dataframe(df):
    return df.withColumn("pdbPosition", explode(col("residueMapping.pdbPosition"))) \
             .withColumn("pdbAminoAcid", explode(col("residueMapping.pdbAminoAcid"))) 
Example #7
Source File: myVariantDataset.py    From mmtf-pyspark with Apache License 2.0 (5 votes)
def _flatten_dataframe(df):
    return df.withColumn("variationId", explode(df.hits._id)) \
             .select(col("variationId"), col("uniprotId")) 
Example #8
Source File: data_fetcher.py    From ReAgent with BSD 3-Clause "New" or "Revised" License (5 votes)
def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """ Return list of distinct keys.
        Set is_col_arr_map to be true if column is an array of Maps.
        Otherwise, assume column is a Map.
    """
    if is_col_arr_map:
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect() 
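A hypothetical usage sketch for get_distinct_keys(), assuming a map-typed column; map_keys() is available in pyspark.sql.functions from Spark 2.3 onwards.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, map_keys

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [({"clicks": 1.0, "dwell": 2.5},), ({"clicks": 3.0},)],
    ["features"])

# The map column yields one row per key; duplicates collapse via distinct().
print(get_distinct_keys(df, "features"))  # e.g. ['clicks', 'dwell']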
Example #9
Source File: addon_aggregates.py    From python_mozetl with MIT License (5 votes)
def ms_explode_addons(ms):
    """
    Explodes the active_addons object in
    the ms DataFrame and selects relevant fields

    :param ms: a subset of main_summary
    :return SparkDF
    """
    addons_df = (
        ms.select(MS_FIELDS + [fun.explode("active_addons").alias("addons")])
        .select(MS_FIELDS + ADDON_FIELDS)
        .withColumn("app_version", fun.substring("app_version", 1, 2))
    )
    return addons_df 
Example #10
Source File: norm_query_clustering.py    From search-MjoLniR with MIT License (4 votes)
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id')) 
Example #11
Source File: taar_lite_guidguid.py    From telemetry-airflow with Mozilla Public License 2.0 (4 votes)
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example #12
Source File: taar_lite_guidguid.py    From python_mozetl with MIT License (4 votes)
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of coinstalled_addon, count pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")

    return addon_co_installations_collapsed 
Example #13
Source File: sample_scaffolds.py    From reinvent-scaffold-decorator with MIT License (4 votes)
def run(self, initial_scaffolds):
        randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
        get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
        remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

        results_df = self._initialize_results(initial_scaffolds)
        scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
        i = 0
        while scaffolds_df.count() > 0:
            # generate randomized SMILES
            self._log("info", "Starting iteration #%d.", i)
            scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
                .select(
                    "smiles", "scaffold", "decorations",
                    psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
                .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
                .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
                .withColumn("id", psf.monotonically_increasing_id())\
                .persist()
            self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                      scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

            # sample each randomized scaffold N times
            scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
                .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
            self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
            self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

            # merge decorated molecules
            joined_df = self._join_results(scaffolds_df).persist()

            if joined_df.count() > 0:
                self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                          scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

            scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
                .select("smiles", "scaffold", "decorations")\
                .where("smiles LIKE '%*%'")
            self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

            results_df = results_df.union(joined_df)\
                .groupBy("smiles")\
                .agg(
                    psf.first("scaffold").alias("scaffold"),
                    psf.first("decorations").alias("decorations"),
                    psf.sum("count").alias("count"))\
                .persist()
            i += 1

        return results_df