#
# Copyright (C) 2019 Databricks, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys

from distutils.version import LooseVersion

from databricks.koalas.version import __version__


def assert_pyspark_version():
    import logging

    pyspark_ver = None
    try:
        import pyspark
    except ImportError:
        raise ImportError(
            "Unable to import pyspark - consider doing a pip install with [spark] "
            "extra to install pyspark with pip"
        )
    else:
        pyspark_ver = getattr(pyspark, "__version__")
        if pyspark_ver is None or pyspark_ver < "2.4":
            logging.warning(
                'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
                    pyspark_ver if pyspark_ver is not None else "<unknown version>"
                )
            )


assert_pyspark_version()

import pyspark
import pyarrow

if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
    if (
        LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")
        and "ARROW_PRE_0_15_IPC_FORMAT" not in os.environ
    ):
        import logging

        logging.warning(
            "'ARROW_PRE_0_15_IPC_FORMAT' environment variable was not set. It is required to "
            "set this environment variable to '1' in both driver and executor sides if you use "
            "pyarrow>=0.15 and pyspark<3.0. "
            "Koalas will set it for you but it does not work if there is a Spark context already "
            "launched."
        )
        # This is required to support PyArrow 0.15 in PySpark versions lower than 3.0.
        # See SPARK-29367.
        os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"
elif "ARROW_PRE_0_15_IPC_FORMAT" in os.environ:
    raise RuntimeError(
        "Please explicitly unset 'ARROW_PRE_0_15_IPC_FORMAT' environment variable in both "
        "driver and executor sides. It is required to set this environment variable only "
        "when you use pyarrow>=0.15 and pyspark<3.0."
    )

from databricks.koalas.frame import DataFrame
from databricks.koalas.indexes import Index, MultiIndex
from databricks.koalas.series import Series
from databricks.koalas.config import get_option, set_option, reset_option, options
from databricks.koalas.groupby import NamedAgg

__all__ = [
    "read_csv",
    "read_parquet",
    "to_datetime",
    "from_pandas",
    "get_dummies",
    "DataFrame",
    "Series",
    "Index",
    "MultiIndex",
    "sql",
    "range",
    "concat",
    "melt",
    "get_option",
    "set_option",
    "reset_option",
    "read_sql_table",
    "read_sql_query",
    "read_sql",
    "options",
    "option_context",
    "NamedAgg",
]


def _auto_patch_spark():
    import os
    import logging

    # Attach a usage logger.
    logger_module = os.getenv("KOALAS_USAGE_LOGGER", None)
    if logger_module is not None:
        try:
            from databricks.koalas import usage_logging

            usage_logging.attach(logger_module)
        except Exception as e:
            logger = logging.getLogger("databricks.koalas.usage_logger")
            logger.warning(
                "Tried to attach usage logger `{}`, but an exception was raised: {}".format(
                    logger_module, str(e)
                )
            )

    # Autopatching is on by default.
    x = os.getenv("SPARK_KOALAS_AUTOPATCH", "true")
    if x.lower() in ("true", "1", "enabled"):
        logger = logging.getLogger("spark")
        logger.info(
            "Patching spark automatically. You can disable it by setting "
            "SPARK_KOALAS_AUTOPATCH=false in your environment"
        )

        from pyspark.sql import dataframe as df

        df.DataFrame.to_koalas = DataFrame.to_koalas


def _auto_patch_pandas():
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)
        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)


_auto_patch_spark()
_auto_patch_pandas()

# Import after the usage logger is attached.
from databricks.koalas.config import *
from databricks.koalas.namespace import *
from databricks.koalas.sql import sql
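
# Usage sketch (comments only, not executed as part of this module). A minimal
# example of the `to_koalas` method that _auto_patch_spark() attaches above;
# it assumes pyspark is installed and that SPARK_KOALAS_AUTOPATCH has not been
# disabled. Note the `import databricks.koalas` is what triggers the patch.
#
#   from pyspark.sql import SparkSession
#   import databricks.koalas as ks
#
#   spark = SparkSession.builder.getOrCreate()
#   sdf = spark.range(10)        # a plain pyspark.sql.DataFrame with column "id"
#   kdf = sdf.to_koalas()        # method monkey-patched in by _auto_patch_spark()
#   kdf["id"] = kdf["id"] + 1    # pandas-like API backed by Spark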