/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.analyzer.physical; import static com.linkedin.cubert.utils.JsonUtils.getText; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.node.ArrayNode; import org.codehaus.jackson.node.ObjectNode; import com.linkedin.cubert.io.Storage; import com.linkedin.cubert.io.StorageFactory; import com.linkedin.cubert.operator.PostCondition; import com.linkedin.cubert.utils.FileSystemUtils; import com.linkedin.cubert.utils.JsonUtils; import com.linkedin.cubert.utils.print; /** * Analyzes the data dependency across the jobs in the program. * * Rewrites the plan to add following fields in the plan: * * @author Maneesh Varshney * */ public class DependencyAnalyzer extends PhysicalPlanVisitor implements PlanRewriter { /** * Input and outputs for a job. * * @author Maneesh Varshney * */ private static final class JobInputsOutputs { private final Set<String> inputs = new HashSet<String>(); private final Set<String> outputs = new HashSet<String>(); private final Map<String, String> typeMap = new HashMap<String, String>(); private final Map<String, JsonNode> jsonMap = new HashMap<String, JsonNode>(); void addInput(String type, String name, JsonNode json) { inputs.add(name); typeMap.put(name, type); jsonMap.put(name, json); } void addOutput(String type, String name, JsonNode json) { outputs.add(name); typeMap.put(name, type); jsonMap.put(name, json); } } private final List<JobInputsOutputs> jobDependency = new ArrayList<JobInputsOutputs>(); private JobInputsOutputs currentJob; private final Configuration conf = new JobConf(); private boolean revisit = false; @Override public JsonNode rewrite(JsonNode plan, Set<String> namesUsed, boolean debugMode, boolean revisit) throws IOException { this.revisit = revisit; new PhysicalPlanWalker(plan, this).walk(); return plan; } @Override public void enterJob(JsonNode json) { currentJob = new JobInputsOutputs(); } @Override public void visitInput(JsonNode json) { String type = getText(json, "type"); for (JsonNode pathJson : json.path("path")) { currentJob.addInput(type, JsonUtils.encodePath(pathJson), json); } } @Override public void visitOperator(JsonNode json, boolean isMapper) { String type = getText(json, "operator"); if (type.equals("LOAD_CACHED_FILE")) { try { String pathWithoutFragment = new URI(getText(json, "path")).getPath(); currentJob.addInput(getText(json, "type"), pathWithoutFragment, json); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else if (type.equals("TEE")) { currentJob.addOutput(getText(json, "type"), getText(json, "path"), json); } else if (type.equals("DICT_ENCODE") || type.equals("DICT_DECODE")) { if (json.has("path")) { try { String pathWithoutFragment = new URI(getText(json, "path")).getPath(); currentJob.addInput("AVRO", pathWithoutFragment, json); } catch (URISyntaxException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } @Override public void visitOutput(JsonNode json) { currentJob.addOutput(getText(json, "type"), getText(json, "path"), json); } @Override public void visitCachedIndex(JsonNode json) { currentJob.addInput("RUBIX", getText(json, "path"), json); } @Override public void exitJob(JsonNode json) { jobDependency.add(currentJob); } @Override public void exitProgram(JsonNode json) { ObjectMapper mapper = new ObjectMapper(); // Create reverse dependencies. // jobInputMap: From file name => jobId that needs this data as input // jobOutputMap: From file name => jobId that generates this data as output Map<String, List<Integer>> jobInputMap = new HashMap<String, List<Integer>>(); Map<String, List<Integer>> jobOutputMap = new HashMap<String, List<Integer>>(); int jobId = 0; for (JobInputsOutputs dep : jobDependency) { createReverseDependency(jobId, dep.inputs, jobInputMap); createReverseDependency(jobId, dep.outputs, jobOutputMap); jobId++; } // Infer data dependency across the jobs. // Strategy: look at inputs for a given job. If the input is generated by some // other job, then we have a dependency. // If the input is not generated by any other job, then this input is a "global" // input to the program (which we will handle later in this function). Set<String> programInputs = new HashSet<String>(); jobId = 0; for (JobInputsOutputs dep : jobDependency) { // upstreamJob: all jobs that generate data that this job needs as input Set<Integer> upstreamJobs = new HashSet<Integer>(); for (String input : dep.inputs) { // upstreamJobsForInput: all jobs that generate this particular input List<Integer> upstreamJobsForInput = jobOutputMap.get(input); if (upstreamJobsForInput == null) { programInputs.add(input); } else { if (isParentJob(upstreamJobsForInput, jobId)) { upstreamJobs.addAll(upstreamJobsForInput); } else programInputs.add(input); } } // Add the inferred dependency in the json. // The json syntax is {"dependsOn": [job_id, job_id...]} ArrayNode arrayNode = mapper.createArrayNode(); for (Integer job : upstreamJobs) arrayNode.add((int) job); ((ObjectNode) json.get("jobs").get(jobId)).put("dependsOn", arrayNode); jobId++; } // Now handle the "program inputs". These are data files that are not generated by // any job, that is, they are input to the program. // First, merge the typeMap (and jsonMap) from all jobDependency. Map<String, String> typeMap = new HashMap<String, String>(); Map<String, JsonNode> jsonMap = new HashMap<String, JsonNode>(); for (JobInputsOutputs dep : jobDependency) { for (String input : dep.typeMap.keySet()) { String type = dep.typeMap.get(input); if (typeMap.containsKey(input) && !typeMap.get(input).equals(type)) { throw new IllegalArgumentException(String.format("Dataset [%s] is used within the program with different types: %s and %s", input, type, typeMap.get(input))); } typeMap.put(input, type); jsonMap.put(input, dep.jsonMap.get(input)); } } print.f("[Dependency Analyzer] Program inputs: %s", programInputs); // Next, we obtain the schema of the program input files, // and put them in the json ObjectNode programInputsJson; if (revisit) programInputsJson = (ObjectNode) ((ObjectNode) json).get("input"); else { programInputsJson = mapper.createObjectNode(); ((ObjectNode) json).put("input", programInputsJson); } ((ObjectNode) json).put("input", programInputsJson); for (String input : programInputs) { String type = typeMap.get(input); JsonNode inputJson = jsonMap.get(input); try { // Schema for a data set already present. if (programInputsJson.get(input) != null) continue; PostCondition condition = getPostCondition(input, inputJson, type); ObjectNode node = mapper.createObjectNode(); node.put("type", type); node.put("schema", condition.getSchema().toJson()); if (condition.getPartitionKeys() != null) node.put("partitionKeys", JsonUtils.createArrayNode(condition.getPartitionKeys())); if (condition.getSortKeys() != null) node.put("sortKeys", JsonUtils.createArrayNode(condition.getSortKeys())); programInputsJson.put(input, node); } catch (IOException e) { throw new RuntimeException(e); } } } private void createReverseDependency(int jobId, Set<String> datasets, Map<String, List<Integer>> map) { for (String input : datasets) { List<Integer> jobs = map.get(input); if (jobs == null) { jobs = new ArrayList<Integer>(); map.put(input, jobs); } jobs.add(jobId); } } private PostCondition getPostCondition(String input, JsonNode json, String typeStr) throws IOException { FileSystem fs = FileSystem.get(conf); JsonNode pathJson = JsonUtils.decodePath(input); JsonNode params = json.get("params"); Path path = null; List<Path> paths = FileSystemUtils.getPaths(fs, pathJson, true, params); path = paths.get(0); Storage storage = StorageFactory.get(typeStr); return storage.getPostCondition(conf, json, path); } private boolean isParentJob(List<Integer> candidates, int jobId) { for (Integer cand : candidates) if (cand.intValue() < jobId) return true; return false; } }