org.apache.pig.LoadFunc Java Examples

The following examples show how to use org.apache.pig.LoadFunc. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroStorageUtils.java    From Cubert with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves a location string into the set of paths it matches.
 * <p>
 * The string is broken up with {@code LoadFunc.getPathStrings} (it may hold
 * comma-separated paths and glob style paths); each entry is then expanded
 * with {@code globStatus} on its own file system, using {@code PATH_FILTER}.
 *
 * @param pathString     location string to expand
 * @param conf           configuration used to obtain the file system of each entry
 * @param failIfNotFound if {@code true}, an entry without any match raises an
 *                       {@link IOException}; if {@code false}, such an entry is skipped
 * @return the paths of all matched files
 * @throws IOException if an entry matches no files while {@code failIfNotFound}
 *                     is set, or if a file system call fails
 */
public static Set<Path> getPaths(String pathString, Configuration conf, boolean failIfNotFound)
        throws IOException {
    Set<Path> result = new HashSet<Path>();
    for (String entry : LoadFunc.getPathStrings(pathString)) {
        Path pattern = new Path(entry);
        FileSystem fileSystem = FileSystem.get(pattern.toUri(), conf);
        FileStatus[] matches = fileSystem.globStatus(pattern, PATH_FILTER);

        boolean nothingMatched = (matches == null) || (matches.length == 0);
        if (nothingMatched) {
            if (failIfNotFound) {
                throw new IOException("Input Pattern " + entry + " matches 0 files");
            }
            // Tolerated miss: move on to the next entry.
            continue;
        }

        for (int i = 0; i < matches.length; i++) {
            result.add(matches[i].getPath());
        }
    }
    return result;
}
 
Example #2
Source File: POCast.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Lazily resolves {@code caster} from {@code funcSpec}.
 * <p>
 * Nothing happens when a caster is already present or when no function spec
 * is set. Otherwise the spec is instantiated and the load caster of the
 * resulting {@code LoadFunc} or {@code StreamToPig} is stored.
 *
 * @throws IOException if the instantiated object is neither a
 *                     {@code LoadFunc} nor a {@code StreamToPig}
 */
private void instantiateFunc() throws IOException {
    if (caster != null || funcSpec == null) {
        return;
    }

    Object instance = PigContext.instantiateFuncFromSpec(funcSpec);
    if (instance instanceof LoadFunc) {
        caster = ((LoadFunc) instance).getLoadCaster();
        return;
    }
    if (instance instanceof StreamToPig) {
        caster = ((StreamToPig) instance).getLoadCaster();
        return;
    }
    throw new IOException("Invalid class type "
            + funcSpec.getClassName());
}
 
Example #3
Source File: POMergeJoin.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the right-side loader from {@code rightLoaderFuncSpec}, configures
 * it, and positions it near the given key.
 *
 * @param firstLeftKey key to seek near; used as is when it already is a
 *                     {@code Tuple}, otherwise wrapped into a new tuple
 * @throws IOException propagated from the loader calls
 */
private void seekInRightStream(Object firstLeftKey) throws IOException{
    rightLoader = (LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec);

    // An index file is only handed over to the default indexable loader
    // (case where the hadoop distributed cache is used).
    boolean usesIndexFile = indexFile != null && rightLoader instanceof DefaultIndexableLoader;
    if (usesIndexFile) {
        ((DefaultIndexableLoader)rightLoader).setIndexFile(indexFile);
    }

    // Hand the loader its signature, then work on a copy of the conf for
    // all further calls to rightLoader.
    rightLoader.setUDFContextSignature(signature);
    Configuration confCopy = new Configuration(PigMapReduce.sJobConfInternal.get());
    Job job = new Job(confCopy);
    rightLoader.setLocation(rightInputFileName, job);

    IndexableLoadFunc indexable = (IndexableLoadFunc)rightLoader;
    indexable.initialize(job.getConfiguration());

    Tuple seekKey;
    if (firstLeftKey instanceof Tuple) {
        seekKey = (Tuple)firstLeftKey;
    } else {
        seekKey = mTupleFactory.newTuple(firstLeftKey);
    }
    indexable.seekNear(seekKey);
}
 
Example #4
Source File: DefaultIndexableLoader.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Sets {@code loader} to a {@code ReadToEndLoader} that wraps a freshly
 * instantiated right-side load function and reads the given splits.
 *
 * @param splitsToBeRead the splits handed to the {@code ReadToEndLoader}
 * @throws IOException propagated from deserialization or loader construction
 */
private void initRightLoader(int [] splitsToBeRead) throws IOException{
    String serializedContext = PigMapReduce.sJobConfInternal.get().get("pig.pigContext");
    PigContext pigContext = (PigContext) ObjectSerializer.deserialize(serializedContext);

    Configuration conf = ConfigurationUtil.toConfiguration(pigContext.getProperties());

    // Hadoop security needs this property to be set
    String tokenFile = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
    if (tokenFile != null) {
        conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, tokenFile);
    }

    // ReadToEndLoader reads the given splits in order
    LoadFunc rightLoadFunc = (LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec);
    loader = new ReadToEndLoader(rightLoadFunc, conf, inpLocation, splitsToBeRead);
}
 
Example #5
Source File: TestJobControlCompiler.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@code POLoad} backed by a temporary file of the requested length.
 * <p>
 * The temporary file is registered for deletion on JVM exit. The given load
 * function is pointed at the file and used both for the {@code POLoad} and
 * (via its class name) for the file's {@code FuncSpec}.
 *
 * @param size     length, in bytes, the temporary file is set to
 * @param loadFunc load function to attach to the returned operator
 * @return the configured operator, after {@code setUp()} has been called on it
 * @throws Exception if the file cannot be created or sized, or if setting up
 *                   the operator fails
 */
public static POLoad createPOLoadWithSize(long size, LoadFunc loadFunc) throws Exception {
    File file = File.createTempFile("tempFile", ".tmp");
    file.deleteOnExit();
    RandomAccessFile f = new RandomAccessFile(file, "rw");
    try {
        f.setLength(size);
    } finally {
        // Always release the file handle, even when setLength fails.
        f.close();
    }

    String location = file.getAbsolutePath();
    loadFunc.setLocation(location, new org.apache.hadoop.mapreduce.Job(CONF));
    FuncSpec funcSpec = new FuncSpec(loadFunc.getClass().getCanonicalName());
    POLoad poLoad = new POLoad(new OperatorKey(), loadFunc);
    poLoad.setLFile(new FileSpec(location, funcSpec));
    poLoad.setPc(new PigContext());
    poLoad.setUp();

    return poLoad;
}
 
Example #6
Source File: GruntParser.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Runs an 'fs' command through {@code shell}, unless in "explain" mode.
 * <p>
 * In "explain" mode the statement is ignored with a warning. Otherwise the
 * pending batch is executed first and the command is run; a non-zero return
 * code is turned into an {@link IOException} when not interactive.
 *
 * @param cmdTokens tokens of the fs command
 * @throws IOException if running the command throws, or if it returns a
 *                     non-zero code in non-interactive mode
 */
@Override
protected void processFsCommand(String[] cmdTokens) throws IOException {
    filter.validate(PigCommandFilter.Command.FS);

    if (mExplain != null) {
        // "explain" mode: nothing is executed
        log.warn("'fs' statement is ignored while processing 'explain -script' or '-check'");
        return;
    }

    executeBatch();

    int retCode;
    try {
        retCode = shell.run(cmdTokens);
    } catch (Exception e) {
        throw new IOException(e);
    }

    boolean failed = retCode != 0;
    if (failed && !mInteractive) {
        String command = LoadFunc.join(
                (AbstractList<String>) Arrays.asList(cmdTokens), " ");
        throw new IOException("fs command '" + command
                + "' failed. Please check output logs for details");
    }
}
 
Example #7
Source File: TezScriptState.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the parent visit, then derives the summary strings of this visitor.
 * <p>
 * When any aliases were collected, {@code alias} and {@code aliasLocation}
 * become the sorted, comma-joined contents of {@code aliases} and
 * {@code aliasLocations}. {@code features} always becomes the comma-joined
 * names of the {@code PIG_FEATURE} values whose bits are set in
 * {@code featureSet}.
 *
 * @throws VisitorException propagated from the parent visit
 */
@Override
public void visit() throws VisitorException {
    super.visit();

    if (!aliases.isEmpty()) {
        ArrayList<String> sortedAliases = new ArrayList<String>(aliases);
        ArrayList<String> sortedLocations = new ArrayList<String>(aliasLocations);
        Collections.sort(sortedAliases);
        Collections.sort(sortedLocations);
        alias = LoadFunc.join(sortedAliases, ",");
        aliasLocation = LoadFunc.join(sortedLocations, ",");
    }

    StringBuilder featureNames = new StringBuilder();
    int bit = featureSet.nextSetBit(0);
    while (bit >= 0) {
        if (featureNames.length() != 0) {
            featureNames.append(",");
        }
        featureNames.append(PIG_FEATURE.values()[bit].name());
        bit = featureSet.nextSetBit(bit + 1);
    }
    features = featureNames.toString();
}
 
Example #8
Source File: MRScriptState.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Records the aliases and alias locations of a map-reduce operator.
 * <p>
 * The map plan, the combine plan (when present) and the reduce plan are
 * visited in that order. Their alias locations are written as
 * {@code "M: ..."}, {@code " C: ..."} and {@code " R: ..."} sections, and
 * all aliases are gathered into one list that is sorted afterwards. A
 * {@code VisitorException} is logged as a warning; whatever was gathered up
 * to that point is still stored.
 *
 * @param mro operator whose plans are inspected and which keys both maps
 */
private void setAlias(MapReduceOper mro) {
    ArrayList<String> collectedAliases = new ArrayList<String>();
    StringBuilder locations = new StringBuilder();
    try {
        ArrayList<String> mapLocations = new ArrayList<String>();
        new AliasVisitor(mro.mapPlan, collectedAliases, mapLocations).visit();
        String mapJoined = LoadFunc.join(mapLocations, ",");
        locations.append("M: ").append(mapJoined);

        if (mro.combinePlan != null) {
            ArrayList<String> combineLocations = new ArrayList<String>();
            new AliasVisitor(mro.combinePlan, collectedAliases, combineLocations).visit();
            String combineJoined = LoadFunc.join(combineLocations, ",");
            locations.append(" C: ").append(combineJoined);
        }

        ArrayList<String> reduceLocations = new ArrayList<String>();
        new AliasVisitor(mro.reducePlan, collectedAliases, reduceLocations).visit();
        String reduceJoined = LoadFunc.join(reduceLocations, ",");
        locations.append(" R: ").append(reduceJoined);

        if (!collectedAliases.isEmpty()) {
            Collections.sort(collectedAliases);
        }
    } catch (VisitorException e) {
        LOG.warn("unable to get alias", e);
    }
    aliasMap.put(mro, LoadFunc.join(collectedAliases, ","));
    aliasLocationMap.put(mro, locations.toString());
}
 
Example #9
Source File: TestBuiltin.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Tests {@link TextLoader} wrapped in a {@code ReadToEndLoader}: a file
 * holding two lines must yield one tuple per line, and an empty file must
 * yield no tuple at all.
 */
@Test
public void testLFText() throws Exception {
    final String twoLineFile = "testLFTest-input1.txt";
    final String emptyFile = "testLFTest-input2.txt";

    String input1 = "This is some text.\nWith a newline in it.\n";
    String expected1 = "This is some text.";
    String expected2 = "With a newline in it.";

    // Case 1: two lines of text come back as two tuples.
    Util.createInputFile(cluster, twoLineFile, new String[] {input1});
    LoadFunc twoLineLoader = new ReadToEndLoader(new TextLoader(),
            ConfigurationUtil.toConfiguration(cluster.getProperties()), twoLineFile, 0);
    Tuple first = twoLineLoader.getNext();
    Tuple second = twoLineLoader.getNext();
    Util.deleteFile(cluster, twoLineFile);
    assertTrue(expected1.equals(first.get(0).toString()));
    assertTrue(expected2.equals(second.get(0).toString()));

    // Case 2: an empty input produces no tuple.
    Util.createInputFile(cluster, emptyFile, new String[] {});
    LoadFunc emptyLoader = new ReadToEndLoader(new TextLoader(),
            ConfigurationUtil.toConfiguration(cluster.getProperties()), emptyFile, 0);
    Tuple none = emptyLoader.getNext();
    Util.deleteFile(cluster, emptyFile);
    assertTrue(none == null);
}
 
Example #10
Source File: AvroStorageUtils.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Expands a location string into the set of paths that match it.
 * <p>
 * The input may contain comma-separated paths and glob style paths; it is
 * split with {@code LoadFunc.getPathStrings} and every piece is globbed on
 * its own file system with {@code PATH_FILTER} applied.
 *
 * @param pathString     the location string to expand
 * @param conf           configuration used to obtain each file system
 * @param failIfNotFound whether a piece that matches nothing is an error
 *                       ({@code true}) or is silently skipped ({@code false})
 * @return the paths of every matched file
 * @throws IOException when a piece matches no files and
 *                     {@code failIfNotFound} is {@code true}, or when a file
 *                     system call fails
 */
public static Set<Path> getPaths(String pathString, Configuration conf, boolean failIfNotFound)
        throws IOException {
    Set<Path> collected = new HashSet<Path>();
    String[] pieces = LoadFunc.getPathStrings(pathString);
    for (int i = 0; i < pieces.length; i++) {
        String piece = pieces[i];
        Path globPath = new Path(piece);
        FileStatus[] hits = FileSystem.get(globPath.toUri(), conf)
                .globStatus(globPath, PATH_FILTER);

        if (hits != null && hits.length > 0) {
            for (FileStatus hit : hits) {
                collected.add(hit.getPath());
            }
        } else if (failIfNotFound) {
            throw new IOException("Input Pattern " + piece + " matches 0 files");
        }
        // otherwise: nothing matched and that is tolerated
    }
    return collected;
}
 
Example #11
Source File: PigInputFormat.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Hands the loader signature to the LoadFunc and, through the conf, to the
 * InputFormat; afterwards sets up the UDF context from the conf.
 *
 * @param loadFunc the LoadFunc to set the signature on
 * @param inputIndex index of the input that corresponds to the LoadFunc
 * @param conf the Configuration the signature list is read from and the
 * signature is written to
 * @throws IOException on failure
 */
@SuppressWarnings("unchecked")
static void passLoadSignature(LoadFunc loadFunc, int inputIndex,
        Configuration conf) throws IOException {
    String serializedSignatures = conf.get("pig.inpSignatures");
    List<String> signatures =
            (ArrayList<String>)ObjectSerializer.deserialize(serializedSignatures);

    // A null signature is possible for intermediate jobs, where it does
    // not have to be passed down.
    String signature = signatures.get(inputIndex);
    if (signature != null) {
        loadFunc.setUDFContextSignature(signature);
        conf.set("pig.loader.signature", signature);
    }

    MapRedUtil.setupUDFContext(conf);
}
 
Example #12
Source File: PathPartitionHelper.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up the partition keys of a location and stores them, comma-joined,
 * under {@code PARTITION_COLUMNS} in the UDF properties identified by the
 * loader class and signature. Nothing is stored when the lookup returns
 * {@code null}.
 *
 * @param location
 *            String must be the base directory for the partitions
 * @param conf
 *            configuration passed to the partition key lookup
 * @param loaderClass
 *            loader class that identifies the UDF properties
 * @param signature
 *            signature that, with the loader class, identifies the UDF properties
 * @throws IOException
 *             propagated from the partition key lookup
 */
public void setPartitionKeys(String location, Configuration conf,
        Class<? extends LoadFunc> loaderClass, String signature)
        throws IOException {

    Set<String> partitionKeys = getPartitionKeys(location, conf);
    if (partitionKeys == null) {
        return;
    }

    StringBuilder joinedKeys = new StringBuilder();
    String separator = "";
    for (String key : partitionKeys) {
        joinedKeys.append(separator);
        joinedKeys.append(key);
        separator = ",";
    }

    UDFContext.getUDFContext()
            .getUDFProperties(loaderClass, new String[] { signature })
            .setProperty(PARTITION_COLUMNS, joinedKeys.toString());
}
 
Example #13
Source File: LOLoad.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a load operator; used from the LogicalPlanBuilder.
 *
 * @param loader file spec of the input, may be null; when present its file
 *               name is kept as the schema file
 * @param schema schema user specified in script, or null if not specified.
 * @param plan logical plan this load is part of.
 * @param conf configuration kept on the operator and used when storing the
 *             script schema
 * @param loadFunc the LoadFunc instance for this load, may be null; when null
 *                 no schema is determined
 * @param signature the signature that will be passed to the LoadFunc
 * @throws RuntimeException if fetching the schema raises a FrontendException
 */
public LOLoad(FileSpec loader, LogicalSchema schema, LogicalPlan plan, Configuration conf, LoadFunc loadFunc, String signature) {
    super("LOLoad", plan);
    this.scriptSchema = schema;
    this.fs = loader;
    if (loader == null) {
        this.schemaFile = null;
    } else {
        this.schemaFile = loader.getFileName();
    }
    this.conf = conf;
    this.loadFunc = loadFunc;
    this.signature = signature;
    storeScriptSchema(conf, scriptSchema, signature);

    if (loadFunc == null) {
        this.determinedSchema = null;
        return;
    }

    loadFunc.setUDFContextSignature(signature);
    try {
        this.determinedSchema = getSchemaFromMetaData();
    } catch (FrontendException e) {
        throw new RuntimeException("Can not retrieve schema from loader " + loadFunc, e);
    }
}
 
Example #14
Source File: PigRecordReader.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Stores the collaborators, resets the reading state, opens the next record
 * reader and finally reads the UDF profiling settings from the
 * configuration of the task attempt context.
 *
 * @param inputformat input format kept by this reader
 * @param pigSplit split kept by this reader
 * @param loadFunc load function kept by this reader; its string form names
 *                 the counter group when profiling is enabled
 * @param context task attempt context; also supplies the configuration
 * @param limit limit value kept by this reader
 * @throws IOException propagated from opening the next record reader
 * @throws InterruptedException propagated from opening the next record reader
 */
public PigRecordReader(InputFormat<?, ?> inputformat, PigSplit pigSplit,
        LoadFunc loadFunc, TaskAttemptContext context, long limit) throws IOException, InterruptedException {
    // collaborators
    this.inputformat = inputformat;
    this.pigSplit = pigSplit;
    this.loadfunc = loadFunc;
    this.context = context;
    this.reporter = PigStatusReporter.getInstance();
    this.inputSpecificConf = context.getConfiguration();

    // reading state
    this.curReader = null;
    this.progress = 0;
    this.idx = 0;
    this.limit = limit;

    initNextRecordReader();

    // profiling settings
    this.doTiming = this.inputSpecificConf.getBoolean(PIG_UDF_PROFILE, false);
    if (this.doTiming) {
        this.counterGroup = loadFunc.toString();
        this.timingFrequency = this.inputSpecificConf.getLong(PIG_UDF_PROFILE_FREQUENCY, 100L);
    }
}
 
Example #15
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * A remote hdfs path must come back from {@code getAbsolutePath} unchanged.
 */
@Test
public void testGetAbsolutePath3() throws IOException {
    String remotePath = "hdfs://myhost.mydomain:37765/data/passwd";
    String resolved = LoadFunc.getAbsolutePath(remotePath, curHdfsDir);
    Assert.assertEquals(remotePath, resolved);
}
 
Example #16
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Comma separated relative paths, one of them a hadoop glob: every path is
 * expected to be resolved separately and the glob's own comma kept intact.
 */
@Test
public void testCommaSeparatedString4() throws Exception {
    String expected = curHdfsRoot + "/user/pig/test/{a,c}," +
            curHdfsRoot + "/user/pig/test/b";
    String actual = LoadFunc.getAbsolutePath("test/{a,c},test/b", curHdfsDir);
    Assert.assertEquals(expected, actual);
}
 
Example #17
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Comma separated paths mixing a relative hadoop glob with an absolute
 * path: the expected result prefixes only the relative glob.
 */
@Test
public void testCommaSeparatedString6() throws Exception {
    String expected = curHdfsRoot + "/user/pig/test/{a,c},/test/data/b";
    String actual = LoadFunc.getAbsolutePath("test/{a,c},/test/data/b", curHdfsDir);
    Assert.assertEquals(expected, actual);
}
 
Example #18
Source File: TestPOCast.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a two-operator physical plan in which a new projection feeds the
 * given cast operator.
 * <p>
 * The cast is given a {@code FuncSpec} named after the {@code TestLoader}
 * class, and the projection's result type is set to
 * {@code DataType.BYTEARRAY}.
 *
 * @param op cast operator to place in the plan
 * @return the plan holding the projection connected to {@code op}
 * @throws IOException declared by the signature
 */
private PhysicalPlan constructPlan(POCast op) throws IOException {
    LoadFunc loader = new TestLoader();
    String loaderClassName = loader.getClass().getName();
    op.setFuncSpec(new FuncSpec(loaderClassName));

    OperatorKey projectKey = new OperatorKey("", r.nextLong());
    POProject project = new POProject(projectKey, -1, 0);

    PhysicalPlan result = new PhysicalPlan();
    result.add(project);
    result.add(op);
    result.connect(project, op);

    project.setResultType(DataType.BYTEARRAY);
    return result;
}
 
Example #19
Source File: TestBuiltin.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a loaded tab-separated line through a Pig script and checks that
 * reading the stored output back with {@code PigStorage} yields a tuple
 * equal to one built from the same fields.
 */
@Test
public void testSFPig() throws Exception {
    Util.resetStateForExecModeSwitch();
    PigServer mrPigServer = new PigServer(cluster.getExecType(), properties);

    String inputStr = "amy\tbob\tcharlene\tdavid\terin\tfrank";
    Util.createInputFile(cluster, "testSFPig-input.txt", new String[] {inputStr});

    // Expected tuple: one DataByteArray per tab-separated field.
    String[] fields = inputStr.split("\t");
    DataByteArray[] input = new DataByteArray[fields.length];
    for (int i = 0; i < fields.length; i++) {
        input[i] = new DataByteArray(fields[i]);
    }
    Tuple expected = Util.loadTuple(
            TupleFactory.getInstance().newTuple(input.length), input);

    String outputLocation = "testSFPig-output.txt";
    String query = "a = load 'testSFPig-input.txt';" +
            "store a into '" + outputLocation + "';";
    mrPigServer.setBatchOn();
    Util.registerMultiLineQuery(mrPigServer, query);
    mrPigServer.executeBatch();

    LoadFunc reader = new ReadToEndLoader(new PigStorage(),
            ConfigurationUtil.toConfiguration(cluster.getProperties()), outputLocation, 0);
    Tuple actual = reader.getNext();

    Util.deleteFile(cluster, "testSFPig-input.txt");
    Util.deleteFile(cluster, outputLocation);

    assertEquals(expected, actual);
}
 
Example #20
Source File: AllLoader.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Chooses the load function for the split's file, points it at that file
 * and creates and initializes the matching record reader.
 *
 * @param inputSplit split to read; cast to {@code FileSplit}
 * @param taskAttemptContext context supplying the configuration and job name
 * @throws IOException if no load function can be determined for the file,
 *                     or propagated from the loader/reader calls
 * @throws InterruptedException propagated from the reader calls
 */
@SuppressWarnings("unchecked")
@Override
public void initialize(InputSplit inputSplit,
        TaskAttemptContext taskAttemptContext) throws IOException,
        InterruptedException {

    FileSplit fileSplit = (FileSplit) inputSplit;
    path = fileSplit.getPath();
    String fileName = path.toUri().toString();

    // pick the load function that fits this file
    loadFuncHelper = new LoadFuncHelper(
            taskAttemptContext.getConfiguration());
    FuncSpec chosenSpec = loadFuncHelper.determineFunction(fileName);
    if (chosenSpec == null) {
        throw new IOException("Cannot determine LoadFunc for "
                + fileName);
    }

    // instantiate it and point it at the file
    selectedLoadFunc = (LoadFunc) PigContext
            .instantiateFuncFromSpec(chosenSpec);
    selectedLoadFunc.setUDFContextSignature(udfSignature);
    Job job = new Job(taskAttemptContext.getConfiguration(),
            taskAttemptContext.getJobName());
    selectedLoadFunc.setLocation(fileName, job);

    // build and initialise the reader of the chosen load function
    selectedReader = selectedLoadFunc.getInputFormat()
            .createRecordReader(fileSplit, taskAttemptContext);
    selectedReader.initialize(fileSplit, taskAttemptContext);

    LOG.info("Using LoadFunc " + selectedLoadFunc.getClass().getName()
            + " on " + fileName);
}
 
Example #21
Source File: PathPartitionHelper.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the given expression under {@code PARITITION_FILTER_EXPRESSION} in
 * the UDF properties identified by the loader class and signature.
 *
 * @param partitionFilterExpression
 *            expression to store
 * @param loaderClass
 *            loader class that identifies the UDF properties
 * @param signature
 *            signature that, with the loader class, identifies the UDF properties
 * @throws IOException
 *             declared by the signature
 */
public void setPartitionFilterExpression(String partitionFilterExpression,
        Class<? extends LoadFunc> loaderClass, String signature)
        throws IOException {

    String[] propertyArgs = new String[] { signature };
    UDFContext udfContext = UDFContext.getUDFContext();
    udfContext.getUDFProperties(loaderClass, propertyArgs)
            .setProperty(PARITITION_FILTER_EXPRESSION, partitionFilterExpression);
}
 
Example #22
Source File: TestAvroStorage.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Prefixes an input location with the {@code user.dir} system property and
 * {@code basedir}.
 * <p>
 * A location that {@code LoadFunc.getPathStrings} splits into exactly one
 * entry is prefixed as given. Otherwise every entry is trimmed, prefixed on
 * its own, and the results are joined with commas.
 *
 * @param file input location, possibly holding several paths
 * @return the prefixed location string
 */
private static String getInputFile(String file) {
    String[] locations = LoadFunc.getPathStrings(file);
    String prefix = System.getProperty("user.dir") + "/" + basedir;

    if (locations.length == 1) {
        return prefix + file;
    }

    ArrayList<String> prefixed = new ArrayList<String>();
    for (String location : locations) {
        prefixed.add(prefix + location.trim());
    }
    return LoadFunc.join(prefixed, ",");
}
 
Example #23
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * A har:// input location must come back from {@code getAbsolutePath}
 * unchanged.
 */
@Test
public void testHarUrl() throws Exception {
    String harLocation = "har:///user/pig/harfile";
    String resolved = LoadFunc.getAbsolutePath(harLocation, curHdfsDir);
    Assert.assertEquals("har:///user/pig/harfile", resolved);
}
 
Example #24
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Comma separated absolute paths must come back from
 * {@code getAbsolutePath} unchanged.
 */
@Test
public void testCommaSeparatedString() throws Exception {
    String absolutePaths = "/usr/pig/a,/usr/pig/b,/usr/pig/c";
    String resolved = LoadFunc.getAbsolutePath(absolutePaths, curHdfsDir);
    Assert.assertEquals("/usr/pig/a,/usr/pig/b,/usr/pig/c", resolved);
}
 
Example #25
Source File: TestLoadFunc.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * A location with a non dfs scheme (http) must come back from
 * {@code getAbsolutePath} unchanged.
 */
@Test
public void testGetAbsolutePath4() throws IOException {
    String httpLocation = "http://myhost:12345/data/passwd";
    String resolved = LoadFunc.getAbsolutePath(httpLocation, curHdfsDir);
    Assert.assertEquals("http://myhost:12345/data/passwd", resolved);
}
 
Example #26
Source File: TestFRJoin.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Fills {@code replTbl} from the replicated file {@code repl}.
 * <p>
 * The file is read with a {@code POLoad} using {@code PigStorage} until the
 * end-of-processing status is returned. For every tuple, the field at
 * {@code keyField} is cast to a char array and used as key, field 1 is cast
 * to an integer, and the resulting two-field tuple is added to the bag
 * stored under that key (a new default bag is created for an unseen key).
 *
 * @throws IOException propagated from loading or casting
 */
private void setUpHashTable() throws IOException {
    FuncSpec storageSpec = new FuncSpec(PigStorage.class.getName() + "()");
    FileSpec replFile = new FileSpec(repl, storageSpec);
    POLoad ld = new POLoad(new OperatorKey("Repl File Loader", 1L), replFile);
    PigContext pc = new PigContext(ExecType.MAPREDUCE, PigMapReduce.sJobConfInternal.get());
    pc.connect();
    ld.setPc(pc);

    Result res = ld.getNextTuple();
    while (res.returnStatus != POStatus.STATUS_EOP) {
        Tuple loaded = (Tuple)res.result;
        LoadFunc lf = ((LoadFunc)PigContext.instantiateFuncFromSpec(ld.getLFile().getFuncSpec()));

        byte[] rawKey = ((DataByteArray)loaded.get(keyField)).get();
        String key = lf.getLoadCaster().bytesToCharArray(rawKey);

        Tuple casted = TupleFactory.getInstance().newTuple(2);
        casted.set(0, key);
        casted.set(1, lf.getLoadCaster().bytesToInteger(((DataByteArray)loaded.get(1)).get()));

        DataBag bag;
        if (!replTbl.containsKey(key)) {
            bag = BagFactory.getInstance().newDefaultBag();
            replTbl.put(key, bag);
        } else {
            bag = replTbl.get(key);
        }
        bag.add(casted);

        res = ld.getNextTuple();
    }
}
 
Example #27
Source File: PigInputFormat.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Instantiates the load function of one input.
 * <p>
 * The input list is deserialized from the {@code PIG_INPUTS} entry of the
 * configuration; the function spec of the entry at {@code inputIndex} is
 * then instantiated.
 *
 * @param inputIndex position of the input in the deserialized list
 * @param conf configuration holding the serialized inputs
 * @return the instantiated load function of that input
 * @throws IOException propagated from deserialization
 */
@SuppressWarnings("unchecked")
private static LoadFunc getLoadFunc(int inputIndex, Configuration conf) throws IOException {
    String serializedInputs = conf.get(PIG_INPUTS);
    ArrayList<FileSpec> inputs =
            (ArrayList<FileSpec>) ObjectSerializer.deserialize(serializedInputs);
    FileSpec input = inputs.get(inputIndex);
    return (LoadFunc) PigContext.instantiateFuncFromSpec(input.getFuncSpec());
}
 
Example #28
Source File: PigInputFormat.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Lets the load function set its location on a job created from the
 * supplied conf, then merges that job's configuration into the supplied
 * conf.
 *
 * package level access so that this is not publicly used elsewhere
 *
 * @param loadFunc load function whose {@code setLocation} is invoked
 * @param pigSplit split whose input index selects the load location
 * @param originalConf conf the job is created from and merged back into
 * @throws IOException propagated from job creation or {@code setLocation}
 */
static void mergeSplitSpecificConf(LoadFunc loadFunc, PigSplit pigSplit, Configuration originalConf)
        throws IOException {
    Job job = new Job(originalConf);
    String location = getLoadLocation(pigSplit.getInputIndex(), originalConf);
    loadFunc.setLocation(location, job);

    // setLocation may have written to the job's conf; fold those
    // entries into the original conf
    Configuration jobConf = job.getConfiguration();
    ConfigurationUtil.mergeConf(originalConf, jobConf);
}
 
Example #29
Source File: InputSizeReducerEstimator.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Sums the input size over the given loads.
 * <p>
 * A load whose loader reports a size (anything greater than -1) contributes
 * that size. For any other load, its file name is split into locations and
 * each location is globbed; the lengths of the matches are added up.
 *
 * @param conf Configuration used to classify locations and to obtain file
 * systems
 * @param lds List of POLoads
 * @param job Job passed to the loader size lookup
 * @param max Threshold passed to the path length computation; once the
 * running total exceeds it, the remaining matches of the current location
 * are not added
 * @return the accumulated size, or -1 as soon as a location is of a kind
 * whose size cannot be estimated or its glob returns null
 * @throws IOException propagated from the size lookup or file system calls
 */
static long getTotalInputFileSize(Configuration conf,
        List<POLoad> lds, Job job, long max) throws IOException {
    long total = 0;
    for (POLoad load : lds) {
        long reportedSize = getInputSizeFromLoader(load, job);
        if (reportedSize > -1) {
            total += reportedSize;
            continue;
        }

        // the input file location might be a list of comma separated files,
        // so look at each of them on its own
        String[] locations = LoadFunc.getPathStrings(load.getLFile().getFileName());
        for (String location : locations) {
            if (!UriUtil.isHDFSFileOrLocalOrS3N(location, conf)) {
                // size of this kind of location cannot be estimated
                return -1;
            }

            Path path = new Path(location);
            FileSystem fs = path.getFileSystem(conf);
            FileStatus[] matches = fs.globStatus(path);
            if (matches == null) {
                // file not found
                return -1;
            }

            for (int i = 0; i < matches.length; i++) {
                total += MapRedUtil.getPathLength(fs, matches[i], max);
                if (total > max) {
                    break;
                }
            }
        }
    }
    return total;
}
 
Example #30
Source File: PigHadoopLogger.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Logs a warning and, when aggregating with a reporter available, counts it.
 * <p>
 * Without aggregation, or without a reporter, the message is simply logged.
 * Otherwise it is logged only when it differs from the message last
 * recorded for the same object, and the counters are incremented.
 *
 * @param o object the warning originates from; its class name prefixes the
 *          message
 * @param msg warning text
 * @param warningEnum warning kind, used in the message and as counter
 */
@Override
@SuppressWarnings("rawtypes")
public void warn(Object o, String msg, Enum warningEnum) {
    String className = o.getClass().getName();
    String displayMessage = className + "(" + warningEnum + "): " + msg;

    if (!getAggregate() || reporter == null) {
        //TODO:
        //With aggregation on, a null reporter means local mode: if the
        //PigHadoopLogger is used initially, aggregation cannot be performed.
        //The reference to a reporter is given by Hadoop at run time, and in
        //local mode, due to the absence of Hadoop, there is no reporter.
        //Just print the warning message as is.
        //If a warning message is printed in map reduce mode when aggregation
        //is turned on then we have a problem, its a bug.
        log.warn(displayMessage);
        return;
    }

    // log at least once
    if (!displayMessage.equals(msgMap.get(o))) {
        log.warn(displayMessage);
        msgMap.put(o, displayMessage);
    }

    boolean isUdf = o instanceof EvalFunc || o instanceof LoadFunc || o instanceof StoreFunc;
    if (isUdf) {
        reporter.incrCounter(className, warningEnum.name(), 1);
    }
    // For backwards compatibility, always report with warningEnum, see PIG-3739
    reporter.incrCounter(warningEnum, 1);
}