#  Copyright 2019-2020 The Kale Authors
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import os
import pprint
import tempfile
import logging
import logging.handlers

import nbformat as nb
import networkx as nx

from kubernetes.config import ConfigException

from kale.nbparser import parser
from kale.static_analysis import dependencies, ast
from kale.codegen import generate_code
from kale.utils import utils, graph_utils
from kale.utils.pod_utils import get_docker_base_image
from kale.utils.metadata_utils import parse_metadata
from kale.utils.log_utils import get_or_create_logger

# Key under which Kale stores its settings inside the notebook metadata.
KALE_NOTEBOOK_METADATA_KEY = 'kubeflow_notebook'


class Kale:
    """Use this class to convert a Notebook to a KFP py executable."""

    def __init__(self,
                 source_notebook_path: str,
                 notebook_metadata_overrides: dict = None,
                 debug: bool = False,
                 auto_snapshot: bool = False):
        """Load the notebook and prepare the pipeline metadata.

        Args:
            source_notebook_path: Path to the notebook to convert.
            notebook_metadata_overrides: Optional dict whose entries take
                precedence over the Kale metadata stored in the notebook.
            debug: When True the logger level is DEBUG, otherwise INFO.
            auto_snapshot: Stored on the instance; used when building the
                graph and forwarded to the code generator.

        Raises:
            ValueError: If `source_notebook_path` does not exist.
        """
        self.auto_snapshot = auto_snapshot
        self.source_path = str(source_notebook_path)
        if not os.path.exists(self.source_path):
            raise ValueError("Path {} does not exist".format(self.source_path))

        # read notebook
        self.notebook = nb.read(self.source_path, as_version=nb.NO_CONVERT)

        # read Kale notebook metadata.
        # In case it is not specified get an empty dict
        notebook_metadata = self.notebook.metadata.get(
            KALE_NOTEBOOK_METADATA_KEY, dict())
        # override notebook metadata with provided arguments
        if notebook_metadata_overrides:
            notebook_metadata.update(notebook_metadata_overrides)

        # validate metadata and apply transformations when needed
        self.pipeline_metadata = parse_metadata(notebook_metadata)

        # used to set container step working dir same as current environment
        abs_working_dir = utils.get_abs_working_dir(self.source_path)
        self.pipeline_metadata['abs_working_dir'] = abs_working_dir
        self.detect_environment()

        # set up logging
        level = logging.DEBUG if debug else logging.INFO
        log_path = os.path.join(".", "kale.log")
        self.logger = get_or_create_logger(module=__name__, level=level,
                                           log_path=log_path)

        # mute other loggers
        logging.getLogger('urllib3.connectionpool').setLevel(logging.CRITICAL)

    def detect_environment(self):
        """Detect local confs to preserve reproducibility in pipeline steps.

        Only acts when the 'docker_image' metadata entry is falsy: it is then
        set to the detected base image, or to an empty string when no
        Kubernetes configuration is found. Any other exception raised by the
        detection propagates to the caller.
        """
        # When running inside a Kubeflow Notebook Server we can detect the
        # running docker image and use it as default in the pipeline steps.
        if not self.pipeline_metadata['docker_image']:
            docker_image = ""
            try:
                # will fail in case in cluster config is not found
                docker_image = get_docker_base_image()
            except ConfigException:
                # no K8s config found
                # use kfp default image
                pass
            self.pipeline_metadata["docker_image"] = docker_image

    def notebook_to_graph(self):
        """Convert an annotated Notebook to a Graph.

        Returns:
            A `(pipeline_graph, pipeline_parameters_dict)` tuple: the graph
            produced by the notebook parser (after dependency detection and
            metrics assignment) and the dict parsed from the pipeline
            parameters source.
        """
        # convert notebook to nx graph
        (pipeline_graph,
         pipeline_parameters_source,
         pipeline_metrics_source,
         imports_and_functions) = parser.parse_notebook(self.notebook)

        # get a dict from the 'pipeline parameters' cell source code
        pipeline_parameters_dict = ast.parse_assignments_expressions(
            pipeline_parameters_source)

        # get a list of variables that need to be logged as pipeline metrics
        pipeline_metrics = ast.parse_metrics_print_statements(
            pipeline_metrics_source)

        # run static analysis over the source code
        dependencies.dependencies_detection(
            pipeline_graph,
            pipeline_parameters=pipeline_parameters_dict,
            imports_and_functions=imports_and_functions
        )
        dependencies.assign_metrics(pipeline_graph, pipeline_metrics)

        # if there are multiple DAG leaves, add an empty step at the end of the
        # pipeline for final snapshot
        leaf_steps = graph_utils.get_leaf_nodes(pipeline_graph)
        if self.auto_snapshot and len(leaf_steps) > 1:
            auto_snapshot_name = 'final_auto_snapshot'
            # add a link from all the last steps of the pipeline to
            # the final auto snapshot one.
            for node in leaf_steps:
                pipeline_graph.add_edge(node, auto_snapshot_name)
            data = {auto_snapshot_name: {'source': '', 'ins': [], 'outs': []}}
            nx.set_node_attributes(pipeline_graph, data)

        # TODO: Additional Step required:
        #  Run a static analysis over every step to check that pipeline
        #  parameters are not assigned with new values.
        return pipeline_graph, pipeline_parameters_dict

    def generate_kfp_executable(self, pipeline_graph, pipeline_parameters,
                                save_to_tmp=False):
        """Generate a Python executable starting from a Graph.

        Args:
            pipeline_graph: Graph passed to the code generator as `nb_graph`.
            pipeline_parameters: Passed to the code generator unchanged.
            save_to_tmp: When True the code is written to a new temporary
                directory; otherwise it is written next to the notebook as
                `<pipeline_name>.kale.py`.

        Returns:
            The path the generated code was written to.
        """
        self.logger.debug("------------- Kale Start Run -------------")

        # generate full kfp pipeline definition
        gen_args = {"nb_graph": pipeline_graph,
                    "nb_path": os.path.abspath(self.source_path),
                    "pipeline_parameters": pipeline_parameters,
                    "metadata": self.pipeline_metadata,
                    "auto_snapshot": self.auto_snapshot}
        kfp_code = generate_code.gen_kfp_code(**gen_args)

        if save_to_tmp:
            # save_pipeline picks a temporary location when given None
            output_path = None
        else:
            notebook_dir = os.path.dirname(self.source_path)
            filename = "{}.kale.py".format(
                self.pipeline_metadata['pipeline_name'])
            output_path = os.path.abspath(os.path.join(notebook_dir, filename))
        # save kfp generated code
        output_path = self.save_pipeline(kfp_code, output_path)
        return output_path

    def print_pipeline(self, pipeline_graph):
        """Prints a complete definition of the pipeline with all the tags.

        Nodes are printed in topological order. Attributes that a node does
        not carry ('tags', 'ins', 'outs') are skipped.
        """
        for block_name in nx.topological_sort(pipeline_graph):
            block_data = pipeline_graph.nodes(data=True)[block_name]
            # The 'final_auto_snapshot' node added by notebook_to_graph only
            # has 'source', 'ins' and 'outs', so 'tags' may be missing.
            block_tags = block_data.get('tags', {})

            print("Block: {}".format(block_name))
            print("Previous Blocks:")
            if 'previous_blocks' in block_tags:
                pprint.pprint(block_tags['previous_blocks'], width=1)
            print("Ins")
            if 'ins' in block_data:
                pprint.pprint(sorted(block_data['ins']), width=1)
            print("Outs")
            if 'outs' in block_data:
                pprint.pprint(sorted(block_data['outs']), width=1)
            print()
            print("-------------------------------")
            print()

    def to_dot(self, graph, dot_path):
        """Write the graph to a dot file.

        Args:
            graph: NetworkX graph instance
            dot_path: Path to .dot file location
        """
        nx.drawing.nx_pydot.write_dot(graph, dot_path)

    def save_pipeline(self, pipeline_code, output_path=None):
        """Save Python code to file.

        Args:
            pipeline_code: Source code to write.
            output_path: Destination file. When None, a file with a random
                name is created inside a new temporary directory.

        Returns:
            The path the code was written to.
        """
        if output_path is None:
            # create tmp path
            tmp_dir = tempfile.mkdtemp()
            filename = "kale_pipeline_code_{}.py".format(
                utils.random_string(5))
            output_path = os.path.join(tmp_dir, filename)

        # Python source files are UTF-8 by default, so do not depend on the
        # locale's preferred encoding when writing the generated code.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(pipeline_code)
        self.logger.info("Pipeline code saved at {}".format(output_path))
        return output_path