org.apache.pig.LoadFunc Java Examples
The following examples show how to use
org.apache.pig.LoadFunc.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroStorageUtils.java From Cubert with Apache License 2.0 | 6 votes |
/**
 * Expands a path specification into the set of concrete paths it matches.
 * The specification may hold several comma-separated entries, and each entry
 * may be a glob-style pattern.
 *
 * @param pathString comma-separated paths and/or glob patterns
 * @param conf configuration used to obtain the file system of each entry
 * @param failIfNotFound when true, an entry matching no file raises an
 *            IOException; when false, such an entry is skipped
 * @return the paths matched across all entries
 * @throws IOException if an entry matches nothing while failIfNotFound is set
 */
public static Set<Path> getPaths(String pathString, Configuration conf, boolean failIfNotFound)
        throws IOException {
    Set<Path> result = new HashSet<Path>();
    for (String entry : LoadFunc.getPathStrings(pathString)) {
        Path pattern = new Path(entry);
        FileSystem fileSystem = FileSystem.get(pattern.toUri(), conf);
        FileStatus[] matches = fileSystem.globStatus(pattern, PATH_FILTER);

        boolean nothingMatched = (matches == null) || (matches.length == 0);
        if (nothingMatched && failIfNotFound) {
            throw new IOException("Input Pattern " + entry + " matches 0 files");
        }
        if (nothingMatched) {
            continue;
        }

        for (FileStatus match : matches) {
            result.add(match.getPath());
        }
    }
    return result;
}
Example #2
Source File: POCast.java From spork with Apache License 2.0 | 6 votes |
/**
 * Resolves the caster from the configured function spec, unless a caster is
 * already set or no spec was supplied.
 *
 * @throws IOException if the instantiated object is neither a LoadFunc nor a
 *             StreamToPig
 */
private void instantiateFunc() throws IOException {
    // Nothing to resolve: either already done or there is no spec to use.
    if (caster != null || funcSpec == null) {
        return;
    }

    Object instance = PigContext.instantiateFuncFromSpec(funcSpec);
    if (instance instanceof LoadFunc) {
        caster = ((LoadFunc) instance).getLoadCaster();
        return;
    }
    if (instance instanceof StreamToPig) {
        caster = ((StreamToPig) instance).getLoadCaster();
        return;
    }
    throw new IOException("Invalid class type " + funcSpec.getClassName());
}
Example #3
Source File: POMergeJoin.java From spork with Apache License 2.0 | 6 votes |
private void seekInRightStream(Object firstLeftKey) throws IOException{ rightLoader = (LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec); // check if hadoop distributed cache is used if (indexFile != null && rightLoader instanceof DefaultIndexableLoader) { DefaultIndexableLoader loader = (DefaultIndexableLoader)rightLoader; loader.setIndexFile(indexFile); } // Pass signature of the loader to rightLoader // make a copy of the conf to use in calls to rightLoader. rightLoader.setUDFContextSignature(signature); Job job = new Job(new Configuration(PigMapReduce.sJobConfInternal.get())); rightLoader.setLocation(rightInputFileName, job); ((IndexableLoadFunc)rightLoader).initialize(job.getConfiguration()); ((IndexableLoadFunc)rightLoader).seekNear( firstLeftKey instanceof Tuple ? (Tuple)firstLeftKey : mTupleFactory.newTuple(firstLeftKey)); }
Example #4
Source File: DefaultIndexableLoader.java From spork with Apache License 2.0 | 6 votes |
private void initRightLoader(int [] splitsToBeRead) throws IOException{ PigContext pc = (PigContext) ObjectSerializer .deserialize(PigMapReduce.sJobConfInternal.get().get("pig.pigContext")); Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties()); // Hadoop security need this property to be set if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } //create ReadToEndLoader that will read the given splits in order loader = new ReadToEndLoader((LoadFunc)PigContext.instantiateFuncFromSpec(rightLoaderFuncSpec), conf, inpLocation, splitsToBeRead); }
Example #5
Source File: TestJobControlCompiler.java From spork with Apache License 2.0 | 6 votes |
/**
 * Builds a POLoad whose input is a temporary file of exactly {@code size}
 * bytes, loaded through the supplied load function.
 *
 * @param size length, in bytes, to give the temporary input file
 * @param loadFunc load function to attach to the POLoad
 * @return a POLoad on which setUp() has already been called
 * @throws Exception if the temporary file cannot be created or sized, or if
 *             configuring the load function fails
 */
public static POLoad createPOLoadWithSize(long size, LoadFunc loadFunc) throws Exception {
    File file = File.createTempFile("tempFile", ".tmp");
    file.deleteOnExit();

    RandomAccessFile f = new RandomAccessFile(file, "rw");
    try {
        f.setLength(size);
    } finally {
        // Release the file handle even when setLength fails.
        f.close();
    }

    String location = file.getAbsolutePath();
    loadFunc.setLocation(location, new org.apache.hadoop.mapreduce.Job(CONF));
    FuncSpec funcSpec = new FuncSpec(loadFunc.getClass().getCanonicalName());

    POLoad poLoad = new POLoad(new OperatorKey(), loadFunc);
    poLoad.setLFile(new FileSpec(location, funcSpec));
    poLoad.setPc(new PigContext());
    poLoad.setUp();
    return poLoad;
}
Example #6
Source File: GruntParser.java From spork with Apache License 2.0 | 6 votes |
@Override protected void processFsCommand(String[] cmdTokens) throws IOException { filter.validate(PigCommandFilter.Command.FS); if(mExplain == null) { // process only if not in "explain" mode executeBatch(); int retCode = -1; try { retCode = shell.run(cmdTokens); } catch (Exception e) { throw new IOException(e); } if (retCode != 0 && !mInteractive) { String s = LoadFunc.join( (AbstractList<String>) Arrays.asList(cmdTokens), " "); throw new IOException("fs command '" + s + "' failed. Please check output logs for details"); } } else { log.warn("'fs' statement is ignored while processing 'explain -script' or '-check'"); } }
Example #7
Source File: TezScriptState.java From spork with Apache License 2.0 | 6 votes |
/**
 * Runs the parent visit, then publishes the collected aliases, alias
 * locations and feature names as comma-separated strings. Aliases and
 * locations are sorted before joining; features follow bit order.
 *
 * @throws VisitorException propagated from the parent visit
 */
@Override
public void visit() throws VisitorException {
    super.visit();

    if (!aliases.isEmpty()) {
        ArrayList<String> sortedAliases = new ArrayList<String>(aliases);
        ArrayList<String> sortedLocations = new ArrayList<String>(aliasLocations);
        Collections.sort(sortedAliases);
        Collections.sort(sortedLocations);
        alias = LoadFunc.join(sortedAliases, ",");
        aliasLocation = LoadFunc.join(sortedLocations, ",");
    }

    StringBuilder featureNames = new StringBuilder();
    int bit = featureSet.nextSetBit(0);
    while (bit >= 0) {
        if (featureNames.length() > 0) {
            featureNames.append(",");
        }
        featureNames.append(PIG_FEATURE.values()[bit].name());
        bit = featureSet.nextSetBit(bit + 1);
    }
    features = featureNames.toString();
}
Example #8
Source File: MRScriptState.java From spork with Apache License 2.0 | 6 votes |
/**
 * Collects the aliases and alias locations of a MapReduce operator and records
 * them in aliasMap and aliasLocationMap. Locations are reported per phase as
 * "M: ..." (map), " C: ..." (combine, only when a combine plan exists) and
 * " R: ..." (reduce). A VisitorException is logged and whatever was gathered
 * up to that point is still recorded.
 *
 * @param mro operator whose plans are visited
 */
private void setAlias(MapReduceOper mro) {
    ArrayList<String> aliases = new ArrayList<String>();
    StringBuilder locations = new StringBuilder();
    try {
        ArrayList<String> phaseLocations = new ArrayList<String>();
        new AliasVisitor(mro.mapPlan, aliases, phaseLocations).visit();
        locations.append("M: " + LoadFunc.join(phaseLocations, ","));

        if (mro.combinePlan != null) {
            phaseLocations = new ArrayList<String>();
            new AliasVisitor(mro.combinePlan, aliases, phaseLocations).visit();
            locations.append(" C: " + LoadFunc.join(phaseLocations, ","));
        }

        phaseLocations = new ArrayList<String>();
        new AliasVisitor(mro.reducePlan, aliases, phaseLocations).visit();
        locations.append(" R: " + LoadFunc.join(phaseLocations, ","));

        if (!aliases.isEmpty()) {
            Collections.sort(aliases);
        }
    } catch (VisitorException e) {
        LOG.warn("unable to get alias", e);
    }
    aliasMap.put(mro, LoadFunc.join(aliases, ","));
    aliasLocationMap.put(mro, locations.toString());
}
Example #9
Source File: TestBuiltin.java From spork with Apache License 2.0 | 6 votes |
/** * test {@link TextLoader} - this also tests that {@link TextLoader} is capable * of reading data a couple of dirs deep when the input specified is the top * level directory */ @Test public void testLFText() throws Exception { String input1 = "This is some text.\nWith a newline in it.\n"; String expected1 = "This is some text."; String expected2 = "With a newline in it."; Util.createInputFile(cluster, "testLFTest-input1.txt", new String[] {input1}); // check that loading the top level dir still reading the file a couple // of subdirs below LoadFunc text1 = new ReadToEndLoader(new TextLoader(), ConfigurationUtil. toConfiguration(cluster.getProperties()), "testLFTest-input1.txt", 0); Tuple f1 = text1.getNext(); Tuple f2 = text1.getNext(); Util.deleteFile(cluster, "testLFTest-input1.txt"); assertTrue(expected1.equals(f1.get(0).toString()) && expected2.equals(f2.get(0).toString())); Util.createInputFile(cluster, "testLFTest-input2.txt", new String[] {}); LoadFunc text2 = new ReadToEndLoader(new TextLoader(), ConfigurationUtil. toConfiguration(cluster.getProperties()), "testLFTest-input2.txt", 0); Tuple f3 = text2.getNext(); Util.deleteFile(cluster, "testLFTest-input2.txt"); assertTrue(f3 == null); }
Example #10
Source File: AvroStorageUtils.java From spork with Apache License 2.0 | 6 votes |
/**
 * Resolves a path string into the set of paths it denotes. The string may
 * list several comma-separated paths, and each may use glob syntax.
 *
 * @param pathString comma-separated paths and/or glob patterns
 * @param conf configuration used to obtain the file system for each pattern
 * @param failIfNotFound whether a pattern without matches is an error (true)
 *            or is silently skipped (false)
 * @return every path matched by any of the patterns
 * @throws IOException if a pattern has no match and failIfNotFound is true
 */
public static Set<Path> getPaths(String pathString, Configuration conf, boolean failIfNotFound)
        throws IOException {
    Set<Path> matchedPaths = new HashSet<Path>();
    String[] patterns = LoadFunc.getPathStrings(pathString);
    for (int i = 0; i < patterns.length; i++) {
        String pattern = patterns[i];
        Path patternPath = new Path(pattern);
        FileSystem fs = FileSystem.get(patternPath.toUri(), conf);
        FileStatus[] statuses = fs.globStatus(patternPath, PATH_FILTER);

        boolean hasMatches = statuses != null && statuses.length != 0;
        if (hasMatches) {
            for (FileStatus status : statuses) {
                matchedPaths.add(status.getPath());
            }
        } else if (failIfNotFound) {
            throw new IOException("Input Pattern " + pattern + " matches 0 files");
        }
    }
    return matchedPaths;
}
Example #11
Source File: PigInputFormat.java From spork with Apache License 2.0 | 6 votes |
/** * Pass loader signature to LoadFunc and to InputFormat through * the conf * @param loadFunc the Loadfunc to set the signature on * @param inputIndex the index of the input corresponding to the loadfunc * @param conf the Configuration object into which the signature should be * set * @throws IOException on failure */ @SuppressWarnings("unchecked") static void passLoadSignature(LoadFunc loadFunc, int inputIndex, Configuration conf) throws IOException { List<String> inpSignatureLists = (ArrayList<String>)ObjectSerializer.deserialize( conf.get("pig.inpSignatures")); // signature can be null for intermediate jobs where it will not // be required to be passed down if(inpSignatureLists.get(inputIndex) != null) { loadFunc.setUDFContextSignature(inpSignatureLists.get(inputIndex)); conf.set("pig.loader.signature", inpSignatureLists.get(inputIndex)); } MapRedUtil.setupUDFContext(conf); }
Example #12
Source File: PathPartitionHelper.java From spork with Apache License 2.0 | 6 votes |
/**
 * Reads the partition keys found under the base directory and stores them,
 * comma-separated, in the PARTITION_COLUMNS UDF property of the loader class.
 * Nothing is stored when no key set is returned for the location.
 *
 * @param location must be the base directory for the partitions
 * @param conf configuration passed to the partition-key lookup
 * @param loaderClass loader class whose UDF properties are updated
 * @param signature signature identifying the UDF properties to update
 * @throws IOException propagated from the partition-key lookup
 */
public void setPartitionKeys(String location, Configuration conf,
        Class<? extends LoadFunc> loaderClass, String signature) throws IOException {
    Set<String> keys = getPartitionKeys(location, conf);
    if (keys == null) {
        return;
    }

    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (String key : keys) {
        joined.append(separator);
        joined.append(key);
        separator = ",";
    }

    UDFContext.getUDFContext()
            .getUDFProperties(loaderClass, new String[] { signature })
            .setProperty(PARTITION_COLUMNS, joined.toString());
}
Example #13
Source File: LOLoad.java From spork with Apache License 2.0 | 6 votes |
/**
 * Constructor used from the LogicalPlanBuilder.
 *
 * @param loader FuncSpec for load function to use for this load; may be null,
 *            in which case no schema file is recorded
 * @param schema schema user specified in script, or null if not specified
 * @param plan logical plan this load is part of
 * @param conf configuration stored on the operator and used when storing the
 *            script schema
 * @param loadFunc the LoadFunc that was instantiated from loader; when null
 *            the determined schema is left null
 * @param signature the signature that will be passed to the LoadFunc
 */
public LOLoad(FileSpec loader, LogicalSchema schema, LogicalPlan plan, Configuration conf,
        LoadFunc loadFunc, String signature) {
    super("LOLoad", plan);
    this.scriptSchema = schema;
    this.fs = loader;
    if (loader == null) {
        this.schemaFile = null;
    } else {
        this.schemaFile = loader.getFileName();
    }
    this.conf = conf;
    this.loadFunc = loadFunc;
    this.signature = signature;
    storeScriptSchema(conf, scriptSchema, signature);

    if (loadFunc == null) {
        this.determinedSchema = null;
    } else {
        this.loadFunc.setUDFContextSignature(signature);
        try {
            this.determinedSchema = getSchemaFromMetaData();
        } catch (FrontendException e) {
            throw new RuntimeException("Can not retrieve schema from loader " + loadFunc, e);
        }
    }
}
Example #14
Source File: PigRecordReader.java From spork with Apache License 2.0 | 6 votes |
/**
 * Stores the collaborators, resets the reading state, opens the first
 * underlying record reader and reads the UDF profiling settings from the
 * task configuration.
 *
 * @param inputformat input format kept for later use by this reader
 * @param pigSplit split kept for later use by this reader
 * @param loadFunc load function; its toString() names the counter group when
 *            profiling is enabled
 * @param context task attempt context; supplies the configuration
 * @param limit limit value stored on the reader
 * @throws IOException propagated from initialising the first record reader
 * @throws InterruptedException propagated from initialising the first record reader
 */
public PigRecordReader(InputFormat<?, ?> inputformat, PigSplit pigSplit, LoadFunc loadFunc,
        TaskAttemptContext context, long limit) throws IOException, InterruptedException {
    this.inputformat = inputformat;
    this.pigSplit = pigSplit;
    this.loadfunc = loadFunc;
    this.context = context;
    this.reporter = PigStatusReporter.getInstance();
    this.inputSpecificConf = context.getConfiguration();

    this.curReader = null;
    this.progress = 0;
    this.idx = 0;
    this.limit = limit;

    initNextRecordReader();

    this.doTiming = this.inputSpecificConf.getBoolean(PIG_UDF_PROFILE, false);
    if (!this.doTiming) {
        return;
    }
    this.counterGroup = loadFunc.toString();
    this.timingFrequency = this.inputSpecificConf.getLong(PIG_UDF_PROFILE_FREQUENCY, 100L);
}
Example #15
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testGetAbsolutePath3() throws IOException { // test case: remote hdfs path String absPath = "hdfs://myhost.mydomain:37765/data/passwd"; Assert.assertEquals(absPath, LoadFunc.getAbsolutePath(absPath, curHdfsDir)); }
Example #16
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testCommaSeparatedString4() throws Exception { // test case: comma separated paths with hadoop glob Assert.assertEquals(curHdfsRoot + "/user/pig/test/{a,c}," + curHdfsRoot + "/user/pig/test/b", LoadFunc.getAbsolutePath("test/{a,c},test/b", curHdfsDir)); }
Example #17
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testCommaSeparatedString6() throws Exception { // test case: comma separated paths with hasoop glob Assert.assertEquals(curHdfsRoot + "/user/pig/test/{a,c},/test/data/b", LoadFunc.getAbsolutePath("test/{a,c},/test/data/b", curHdfsDir)); }
Example #18
Source File: TestPOCast.java From spork with Apache License 2.0 | 5 votes |
/**
 * Builds a two-operator physical plan in which a bytearray-typed projection
 * feeds the given cast operator. The cast is configured with a FuncSpec named
 * after the TestLoader class.
 *
 * @param op cast operator to configure and place in the plan
 * @return the plan containing the projection connected to {@code op}
 * @throws IOException declared by the original signature
 */
private PhysicalPlan constructPlan(POCast op) throws IOException {
    LoadFunc testLoader = new TestLoader();
    String loaderClassName = testLoader.getClass().getName();
    op.setFuncSpec(new FuncSpec(loaderClassName));

    OperatorKey projectKey = new OperatorKey("", r.nextLong());
    POProject projection = new POProject(projectKey, -1, 0);

    PhysicalPlan result = new PhysicalPlan();
    result.add(projection);
    result.add(op);
    result.connect(projection, op);
    projection.setResultType(DataType.BYTEARRAY);
    return result;
}
Example #19
Source File: TestBuiltin.java From spork with Apache License 2.0 | 5 votes |
/**
 * Stores a tab-separated line through a Pig script and checks that reading the
 * stored output back with PigStorage yields the original six fields.
 */
@Test
public void testSFPig() throws Exception {
    Util.resetStateForExecModeSwitch();
    PigServer mrPigServer = new PigServer(cluster.getExecType(), properties);

    String inputStr = "amy\tbob\tcharlene\tdavid\terin\tfrank";
    Util.createInputFile(cluster, "testSFPig-input.txt", new String[] {inputStr});

    DataByteArray[] fields = {
        new DataByteArray("amy"),
        new DataByteArray("bob"),
        new DataByteArray("charlene"),
        new DataByteArray("david"),
        new DataByteArray("erin"),
        new DataByteArray("frank")
    };
    Tuple emptyTuple = TupleFactory.getInstance().newTuple(fields.length);
    Tuple expected = Util.loadTuple(emptyTuple, fields);

    String outputLocation = "testSFPig-output.txt";
    String query = "a = load 'testSFPig-input.txt';" + "store a into '" + outputLocation + "';";
    mrPigServer.setBatchOn();
    Util.registerMultiLineQuery(mrPigServer, query);
    mrPigServer.executeBatch();

    LoadFunc reader = new ReadToEndLoader(new PigStorage(),
            ConfigurationUtil.toConfiguration(cluster.getProperties()), outputLocation, 0);
    Tuple actual = reader.getNext();

    Util.deleteFile(cluster, "testSFPig-input.txt");
    Util.deleteFile(cluster, outputLocation);
    assertEquals(expected, actual);
}
Example #20
Source File: AllLoader.java From spork with Apache License 2.0 | 5 votes |
@SuppressWarnings("unchecked") @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) inputSplit; path = fileSplit.getPath(); String fileName = path.toUri().toString(); // select the correct load function and initialise loadFuncHelper = new LoadFuncHelper( taskAttemptContext.getConfiguration()); FuncSpec funcSpec = loadFuncHelper.determineFunction(fileName); if (funcSpec == null) { throw new IOException("Cannot determine LoadFunc for " + fileName); } selectedLoadFunc = (LoadFunc) PigContext .instantiateFuncFromSpec(funcSpec); selectedLoadFunc.setUDFContextSignature(udfSignature); selectedLoadFunc.setLocation(fileName, new Job(taskAttemptContext.getConfiguration(), taskAttemptContext.getJobName())); selectedReader = selectedLoadFunc.getInputFormat() .createRecordReader(fileSplit, taskAttemptContext); selectedReader.initialize(fileSplit, taskAttemptContext); LOG.info("Using LoadFunc " + selectedLoadFunc.getClass().getName() + " on " + fileName); }
Example #21
Source File: PathPartitionHelper.java From spork with Apache License 2.0 | 5 votes |
/**
 * Stores the partition filter expression under the
 * PARITITION_FILTER_EXPRESSION property of the UDF properties identified by
 * the loader class and signature.
 *
 * @param partitionFilterExpression expression to store
 * @param loaderClass loader class whose UDF properties are updated
 * @param signature signature identifying the UDF properties to update
 * @throws IOException declared by the original signature
 */
public void setPartitionFilterExpression(String partitionFilterExpression,
        Class<? extends LoadFunc> loaderClass, String signature) throws IOException {
    UDFContext udfContext = UDFContext.getUDFContext();
    String[] propertyArgs = new String[] { signature };
    udfContext.getUDFProperties(loaderClass, propertyArgs)
            .setProperty(PARITITION_FILTER_EXPRESSION, partitionFilterExpression);
}
Example #22
Source File: TestAvroStorage.java From spork with Apache License 2.0 | 5 votes |
/**
 * Prefixes a test input with the working directory and the base directory.
 * A single location is prefixed as given; for several comma-separated
 * locations each one is trimmed, prefixed, and the results re-joined with ",".
 *
 * @param file one location, or several separated by commas
 * @return the prefixed location(s)
 */
private static String getInputFile(String file) {
    String[] locations = LoadFunc.getPathStrings(file);
    if (locations.length == 1) {
        return System.getProperty("user.dir") + "/" + basedir + file;
    }

    ArrayList<String> prefixed = new ArrayList<String>();
    for (String location : locations) {
        prefixed.add(System.getProperty("user.dir") + "/" + basedir + location.trim());
    }
    return LoadFunc.join(prefixed, ",");
}
Example #23
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testHarUrl() throws Exception { // test case: input location is a har:// url Assert.assertEquals("har:///user/pig/harfile", LoadFunc.getAbsolutePath("har:///user/pig/harfile", curHdfsDir)); }
Example #24
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testCommaSeparatedString() throws Exception { // test case: comma separated absolute paths Assert.assertEquals("/usr/pig/a,/usr/pig/b,/usr/pig/c", LoadFunc.getAbsolutePath("/usr/pig/a,/usr/pig/b,/usr/pig/c", curHdfsDir)); }
Example #25
Source File: TestLoadFunc.java From spork with Apache License 2.0 | 5 votes |
@Test public void testGetAbsolutePath4() throws IOException { // test case: non dfs scheme Assert.assertEquals("http://myhost:12345/data/passwd", LoadFunc.getAbsolutePath("http://myhost:12345/data/passwd", curHdfsDir)); }
Example #26
Source File: TestFRJoin.java From spork with Apache License 2.0 | 5 votes |
/**
 * Loads the replicated file and fills replTbl, grouping rows by the key field.
 * Each stored row is a two-field tuple: the key as a character array value and
 * field 1 converted to an integer.
 *
 * The load function is instantiated once, before reading, rather than once
 * per tuple: the function spec does not change while the file is read.
 *
 * @throws IOException propagated from loading or from the caster conversions
 */
private void setUpHashTable() throws IOException {
    FileSpec replFile = new FileSpec(repl, new FuncSpec(PigStorage.class.getName() + "()"));
    POLoad ld = new POLoad(new OperatorKey("Repl File Loader", 1L), replFile);
    PigContext pc = new PigContext(ExecType.MAPREDUCE, PigMapReduce.sJobConfInternal.get());
    pc.connect();
    ld.setPc(pc);

    // Loop-invariant: used only to obtain the caster for the byte conversions.
    LoadFunc lf = (LoadFunc) PigContext.instantiateFuncFromSpec(ld.getLFile().getFuncSpec());

    for (Result res = ld.getNextTuple();
            res.returnStatus != POStatus.STATUS_EOP;
            res = ld.getNextTuple()) {
        Tuple tup = (Tuple) res.result;
        String key = lf.getLoadCaster().bytesToCharArray(
                ((DataByteArray) tup.get(keyField)).get());

        Tuple csttup = TupleFactory.getInstance().newTuple(2);
        csttup.set(0, key);
        csttup.set(1, lf.getLoadCaster().bytesToInteger(((DataByteArray) tup.get(1)).get()));

        // Only non-null bags are ever stored, so a null lookup means "absent".
        DataBag vals = replTbl.get(key);
        if (vals == null) {
            vals = BagFactory.getInstance().newDefaultBag();
            replTbl.put(key, vals);
        }
        vals.add(csttup);
    }
}
Example #27
Source File: PigInputFormat.java From spork with Apache License 2.0 | 5 votes |
/**
 * Instantiates the LoadFunc of one input from the inputs serialized in the
 * configuration under PIG_INPUTS.
 *
 * @param inputIndex index of the input in the deserialized list
 * @param conf configuration holding the serialized inputs
 * @return a LoadFunc instantiated from that input's function spec
 * @throws IOException propagated from deserialization
 */
@SuppressWarnings("unchecked")
private static LoadFunc getLoadFunc(int inputIndex, Configuration conf) throws IOException {
    String serializedInputs = conf.get(PIG_INPUTS);
    ArrayList<FileSpec> inputs =
            (ArrayList<FileSpec>) ObjectSerializer.deserialize(serializedInputs);
    FileSpec input = inputs.get(inputIndex);
    return (LoadFunc) PigContext.instantiateFuncFromSpec(input.getFuncSpec());
}
Example #28
Source File: PigInputFormat.java From spork with Apache License 2.0 | 5 votes |
/** * get the corresponding configuration for the input on which the split * is based and merge it with the Conf supplied * * package level access so that this is not publicly used elsewhere * @throws IOException */ static void mergeSplitSpecificConf(LoadFunc loadFunc, PigSplit pigSplit, Configuration originalConf) throws IOException { // set up conf with entries from input specific conf Job job = new Job(originalConf); loadFunc.setLocation(getLoadLocation(pigSplit.getInputIndex(), originalConf), job); // The above setLocation call could write to the conf within // the job - merge that updated conf with original conf ConfigurationUtil.mergeConf(originalConf, job.getConfiguration()); }
Example #29
Source File: InputSizeReducerEstimator.java From spork with Apache License 2.0 | 5 votes |
/** * Get the input size for as many inputs as possible. Inputs that do not report * their size nor can pig look that up itself are excluded from this size. * * @param conf Configuration * @param lds List of POLoads * @param job Job * @param max Maximum value of total input size that will trigger exit. Many * times we're only interested whether the total input size is greater than * X or not. In such case, we can exit the function early as soon as the max * is reached. * @return * @throws IOException */ static long getTotalInputFileSize(Configuration conf, List<POLoad> lds, Job job, long max) throws IOException { long totalInputFileSize = 0; for (POLoad ld : lds) { long size = getInputSizeFromLoader(ld, job); if (size > -1) { totalInputFileSize += size; continue; } else { // the input file location might be a list of comma separated files, // separate them out for (String location : LoadFunc.getPathStrings(ld.getLFile().getFileName())) { if (UriUtil.isHDFSFileOrLocalOrS3N(location, conf)) { Path path = new Path(location); FileSystem fs = path.getFileSystem(conf); FileStatus[] status = fs.globStatus(path); if (status != null) { for (FileStatus s : status) { totalInputFileSize += MapRedUtil.getPathLength(fs, s, max); if (totalInputFileSize > max) { break; } } } else { // If file is not found, we should report -1 return -1; } } else { // If we cannot estimate size of a location, we should report -1 return -1; } } } } return totalInputFileSize; }
Example #30
Source File: PigHadoopLogger.java From spork with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("rawtypes") public void warn(Object o, String msg, Enum warningEnum) { String className = o.getClass().getName(); String displayMessage = className + "(" + warningEnum + "): " + msg; if (getAggregate()) { if (reporter != null) { // log at least once if (msgMap.get(o) == null || !msgMap.get(o).equals(displayMessage)) { log.warn(displayMessage); msgMap.put(o, displayMessage); } if (o instanceof EvalFunc || o instanceof LoadFunc || o instanceof StoreFunc) { reporter.incrCounter(className, warningEnum.name(), 1); } // For backwards compatibility, always report with warningEnum, see PIG-3739 reporter.incrCounter(warningEnum, 1); } else { //TODO: //in local mode of execution if the PigHadoopLogger is used initially, //then aggregation cannot be performed as the reporter will be null. //The reference to a reporter is given by Hadoop at run time. //In local mode, due to the absence of Hadoop there will be no reporter //Just print the warning message as is. //If a warning message is printed in map reduce mode when aggregation //is turned on then we have a problem, its a bug. log.warn(displayMessage); } } else { log.warn(displayMessage); } }