Java Code Examples for org.apache.pig.ExecType#MAPREDUCE

The following examples show how to use org.apache.pig.ExecType#MAPREDUCE, drawn from open-source projects. The source file and originating project are noted above each example.
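Most of the examples below share one basic pattern: construct a PigServer (or PigContext) with ExecType.MAPREDUCE plus a set of Hadoop properties, then register and run Pig Latin queries. A minimal sketch of that pattern, assuming a reachable Hadoop cluster whose configuration is on the classpath (the input path and aliases are illustrative):

import java.util.Iterator;
import java.util.Properties;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;

public class MapReduceModeSketch {
    public static void main(String[] args) throws Exception {
        // An empty Properties works when the Hadoop configuration
        // (core-site.xml, mapred-site.xml) is already on the classpath.
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, new Properties());
        pigServer.registerQuery("a = LOAD 'input' AS (x:int, y:int);");
        pigServer.registerQuery("b = FILTER a BY x > 0;");
        Iterator<Tuple> it = pigServer.openIterator("b"); // triggers the MR job
        while (it.hasNext()) {
            System.out.println(it.next());
        }
        pigServer.shutdown();
    }
}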
Example 1
Source File: PigWrapper.java    From elasticsearch-hadoop with Apache License 2.0
protected PigServer createPig() throws ExecException {
    HdpBootstrap.hackHadoopStagingOnWin();

    Properties properties = HdpBootstrap.asProperties(new QueryTestParams(stagingDir).provisionQueries(HdpBootstrap.hadoopConfig()));
    String pigHost = properties.getProperty("pig");
    // remote Pig instance
    if (StringUtils.hasText(pigHost) && !"local".equals(pigHost)) {
        LogFactory.getLog(PigWrapper.class).info("Executing Pig in Map/Reduce mode");
        return new PigServer(ExecType.MAPREDUCE, properties);
    }

    // use local instance
    LogFactory.getLog(PigWrapper.class).info("Executing Pig in local mode");
    properties.put("mapred.job.tracker", "local");
    return new PigServer(ExecType.LOCAL, properties);
}
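The method above falls back to local mode whenever no remote Pig host is configured. A hypothetical helper distilling that decision (not part of the project; it reuses the example's "pig" property name and StringUtils):

// Hypothetical helper mirroring the fallback logic above.
static ExecType resolveExecType(Properties properties) {
    String pigHost = properties.getProperty("pig");
    return (StringUtils.hasText(pigHost) && !"local".equals(pigHost))
            ? ExecType.MAPREDUCE
            : ExecType.LOCAL;
}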
 
Example 2
Source File: TestMergeJoinOuter.java    From spork with Apache License 2.0
@Test
public void testFailure() throws Exception{
    String query = "A = LOAD 'data1' using "+ DummyCollectableLoader.class.getName() +"() as (id, name, grade);" +
    "E = group A by id;" +
    "B = LOAD 'data2' using "+ DummyIndexableLoader.class.getName() +"() as (id, name, grade);" +
    "C = join E by A.id, B by id using 'merge';" +
    "store C into 'output';";
    LogicalPlan lp = Util.buildLp(pigServer, query);
    Operator op = lp.getSinks().get(0);
    LOJoin join = (LOJoin)lp.getPredecessors(op).get(0);
    assertEquals(LOJoin.JOINTYPE.MERGE, join.getJoinType());

    PigContext pc = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
    pc.connect();
    boolean exceptionCaught = false;
    try {
        Util.buildPp(pigServer, query);
    } catch (FrontendException e) {
        assertEquals(1103, e.getErrorCode());
        exceptionCaught = true;
    }
    assertTrue(exceptionCaught);
}
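The assertion expects frontend error 1103, which Pig raises while building the plan when a merge join's inputs do not meet its requirements; here the GROUP (E) sitting between the load and the join is what invalidates the plan.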
 
Example 3
Source File: TestFRJoin2.java    From spork with Apache License 2.0
@Test
public void testTooBigReplicatedFile() throws Exception {
    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());

    pigServer.registerQuery("A = LOAD '" + INPUT_DIR + "' as (x:int,y:int);");
    pigServer.registerQuery("B = LOAD '" + INPUT_FILE + "' as (x:int,y:int);");
    pigServer.registerQuery("C = group B all parallel 5;");
    pigServer.registerQuery("C = foreach C generate MAX(B.x) as x;");
    pigServer.registerQuery("D = join A by x, B by x, C by x using 'repl';");
    {
        // When the total replicated input size (12 + 5 = 17) is bigger than
        // pig.join.replicated.max.bytes=16, an exception is thrown
        try {
            pigServer.getPigContext().getProperties().setProperty(
                    PigConfiguration.PIG_JOIN_REPLICATED_MAX_BYTES,
                    String.valueOf(16));
            pigServer.openIterator("D");
            Assert.fail();
        } catch (FrontendException e) {
            assertEquals("Internal error. Distributed cache could" +
                    " not be set up for the replicated files",
                    e.getCause().getCause().getCause().getMessage());
        }

        // If we increase the limit to 17, it should work
        pigServer.getPigContext().getProperties().setProperty(
                    PigConfiguration.PIG_JOIN_REPLICATED_MAX_BYTES,
                    String.valueOf(17));
        pigServer.openIterator("D");
    }
}
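The arithmetic behind the two settings: the replicated inputs total 12 + 5 = 17 bytes, so a 16-byte limit rejects the join setup while a 17-byte limit just admits it.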
 
Example 4
Source File: TestDBStorage.java    From spork with Apache License 2.0
public TestDBStorage() throws ExecException, IOException {
    // Initialise Pig server
    cluster = MiniCluster.buildCluster();
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.getPigContext().getProperties()
            .setProperty(MRConfiguration.MAP_MAX_ATTEMPTS, "1");
    pigServer.getPigContext().getProperties()
            .setProperty(MRConfiguration.REDUCE_MAX_ATTEMPTS, "1");
    System.out.println("Pig server initialized successfully");
    TMP_DIR = System.getProperty("user.dir") + "/build/test/";
    dblocation = TMP_DIR + "batchtest";
    url = "jdbc:hsqldb:file:" + dblocation
           + ";hsqldb.default_table_type=cached;hsqldb.cache_rows=100";
    // Initialise DBServer
    dbServer = new Server();
    dbServer.setDatabaseName(0, "batchtest");
    // dbServer.setDatabasePath(0, "mem:test;sql.enforce_strict_size=true");
    dbServer.setDatabasePath(0,
                        "file:" + TMP_DIR + "batchtest;sql.enforce_strict_size=true");
    dbServer.setLogWriter(null);
    dbServer.setErrWriter(null);
    dbServer.start();
    System.out.println("Database URL: " + dbUrl);
    try {
        Class.forName(driver);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(this + ".setUp() error: " + e.getMessage());
    }
    System.out.println("Database server started on port: " + dbServer.getPort());
}
 
Example 5
Source File: TestFRJoin2.java    From spork with Apache License 2.0
@Test
public void testSoftLinkDoesNotCreateUnnecessaryConcatJob()
              throws Exception {
    PigServer pigServer = new PigServer(ExecType.MAPREDUCE,
                                  cluster.getProperties());

    pigServer.setBatchOn();
    pigServer.getPigContext().getProperties().setProperty(
              MRCompiler.FILE_CONCATENATION_THRESHOLD, String.valueOf(FILE_MERGE_THRESHOLD));
    pigServer.getPigContext().getProperties().setProperty("pig.noSplitCombination", "false");
    String query = "A = LOAD '" + INPUT_FILE + "' as (x:int,y:int);"
                   + "B = group A all;"
                   + "C = LOAD '" + INPUT_FILE + "' as (x:int,y:int);"
                   + "D = group C by x;"
                   + "E = group D all;"
                   + "F = FOREACH E generate B.$0;"
                   + "Z = LOAD '" + INPUT_FILE + "' as (x:int,y:int);"
                   + "Y = FOREACH E generate F.$0;"
                   + "STORE Y into '/tmp/output2';";
    MROperPlan mrplan = Util.buildMRPlanWithOptimizer(Util.buildPp(pigServer, query), pigServer.getPigContext());

    // look for a concat job
    for (MapReduceOper mrOp : mrplan) {
        // concat job == map plan is just load-store && reduce plan is empty
        if (mrOp.mapPlan.size() == 2 && mrOp.reducePlan.isEmpty()) {
            fail("A concat job was created even though there are no large or multiple inputs.");
        }
    }
}
 
Example 6
Source File: TestMRExecutionEngine.java    From spork with Apache License 2.0
@Test(expected = ExecException.class)
public void testJobConfGeneration() throws ExecException {
    Configuration conf = new Configuration(false);
    conf.set("foo", "bar");
    PigContext pigContext = new PigContext(ExecType.MAPREDUCE, conf);
    // This should fail, as Pig expects the Hadoop configs to be present
    // on the classpath.
    pigContext.connect();
}
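For contrast, a sketch of the succeeding path used throughout these tests, where the mini-cluster's properties supply the Hadoop configuration this test deliberately leaves out:

PigContext pigContext = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
pigContext.connect(); // succeeds: cluster properties stand in for classpath configs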
 
Example 7
Source File: Util.java    From spork with Apache License 2.0
public static String generateURI(String filename, PigContext context)
        throws IOException {
    if(Util.WINDOWS){
        filename = filename.replace('\\','/');
    }
    if (context.getExecType() == ExecType.MAPREDUCE || context.getExecType().name().equals("TEZ") ||
            context.getExecType().name().equals("SPARK")) {
        return FileLocalizer.hadoopify(filename, context);
    } else if (context.getExecType().isLocal()) {
        return filename;
    } else {
        throw new IllegalStateException("ExecType: " + context.getExecType());
    }
}
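A usage sketch (the path is hypothetical, and the exact URI returned depends on FileLocalizer's temporary directory):

PigContext pc = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
pc.connect();
// For MAPREDUCE/TEZ/SPARK the local file is copied into HDFS and the HDFS
// path is returned; in local mode the path comes back unchanged.
String uri = Util.generateURI("/tmp/input.txt", pc);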
 
Example 8
Source File: TestMergeJoin.java    From spork with Apache License 2.0
@Test
public void testParallelism() throws Exception {
    String query = "A = LOAD '" + INPUT_FILE + "';" +
                   "B = LOAD '" + INPUT_FILE + "';" +
                   "C = join A by $0, B by $0 using 'merge' parallel 50;" +
                   "store C into 'out';";
    PigContext pc = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
    pc.connect();
    MROperPlan mro = Util.buildMRPlan(Util.buildPp(pigServer, query), pc);
    Assert.assertEquals(1, mro.getRoots().get(0).getRequestedParallelism());
}
 
Example 9
Source File: TestFRJoin.java    From spork with Apache License 2.0
private void setUpHashTable() throws IOException {
    FileSpec replFile = new FileSpec(repl, new FuncSpec(PigStorage.class.getName() + "()"));
    POLoad ld = new POLoad(new OperatorKey("Repl File Loader", 1L), replFile);
    PigContext pc = new PigContext(ExecType.MAPREDUCE, PigMapReduce.sJobConfInternal.get());
    pc.connect();

    ld.setPc(pc);
    for (Result res = ld.getNextTuple(); res.returnStatus != POStatus.STATUS_EOP; res = ld
            .getNextTuple()) {
        Tuple tup = (Tuple)res.result;
        LoadFunc lf = ((LoadFunc)PigContext.instantiateFuncFromSpec(ld.getLFile().getFuncSpec()));
        String key = lf.getLoadCaster().bytesToCharArray(
                ((DataByteArray)tup.get(keyField)).get());
        Tuple csttup = TupleFactory.getInstance().newTuple(2);
        csttup.set(0, key);
        csttup.set(1, lf.getLoadCaster().bytesToInteger(((DataByteArray)tup.get(1)).get()));
        DataBag vals = null;
        if (replTbl.containsKey(key)) {
            vals = replTbl.get(key);
        }
        else {
            vals = BagFactory.getInstance().newDefaultBag();
            replTbl.put(key, vals);
        }
        vals.add(csttup);
    }
}
 
Example 10
Source File: TestOrderBy.java    From spork with Apache License 2.0
public TestOrderBy() throws Throwable {
    DecimalFormat myFormatter = new DecimalFormat("0000000");
    for (int i = 0; i < DATALEN; i++) {
        DATA[0][i] = myFormatter.format(i);
        DATA[1][i] = myFormatter.format(DATALEN - i - 1);
    }
    pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
}
 
Example 11
Source File: TestMRCompiler.java    From spork with Apache License 2.0
@BeforeClass
public static void setUpBeforeClass() throws Exception {
    cluster = MiniCluster.buildCluster();
    pc = new PigContext(ExecType.LOCAL, new Properties());
    pcMR = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
    pc.connect();
}
 
Example 12
Source File: TestCounters.java    From spork with Apache License 2.0
@Test
public void testMapReduceOnly() throws IOException, ExecException {
    int count = 0;
    PrintWriter pw = new PrintWriter(Util.createInputFile(cluster, file));
    int [] nos = new int[10];
    for(int i = 0; i < 10; i++)
        nos[i] = 0;

    for(int i = 0; i < MAX; i++) {
        int index = r.nextInt(10);
        int value = r.nextInt(100);
        nos[index] += value;
        pw.println(index + "\t" + value);
    }
    pw.close();

    for(int i = 0; i < 10; i++) {
        if(nos[i] > 0) count ++;
    }

    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.registerQuery("a = load '" + file + "';");
    pigServer.registerQuery("b = group a by $0;");
    pigServer.registerQuery("c = foreach b generate group;");

    ExecJob job = pigServer.store("c", "output");
    PigStats pigStats = job.getStatistics();
    InputStream is = FileLocalizer.open(FileLocalizer.fullPath("output",
            pigServer.getPigContext()), pigServer.getPigContext());

    long filesize = 0;
    while(is.read() != -1) filesize++;
    
    is.close();

    cluster.getFileSystem().delete(new Path(file), true);
    cluster.getFileSystem().delete(new Path("output"), true);

    System.out.println("============================================");
    System.out.println("Test case MapReduce");
    System.out.println("============================================");

    JobGraph jp = pigStats.getJobGraph();
    Iterator<JobStats> iter = jp.iterator();
    while (iter.hasNext()) {
        MRJobStats js = (MRJobStats) iter.next();
        System.out.println("Map input records : " + js.getMapInputRecords());
        assertEquals(MAX, js.getMapInputRecords());
        System.out.println("Map output records : " + js.getMapOutputRecords());
        assertEquals(MAX, js.getMapOutputRecords());
        System.out.println("Reduce input records : " + js.getReduceInputRecords());
        assertEquals(MAX, js.getReduceInputRecords());
        System.out.println("Reduce output records : " + js.getReduceOutputRecords());
        assertEquals(count, js.getReduceOutputRecords());
    }
    System.out.println("Hdfs bytes written : " + pigStats.getBytesWritten());
    assertEquals(filesize, pigStats.getBytesWritten());
}
 
Example 13
Source File: TestCounters.java    From spork with Apache License 2.0
@Test
public void testMapOnly() throws IOException, ExecException {
    int count = 0;
    PrintWriter pw = new PrintWriter(Util.createInputFile(cluster, file));
    for(int i = 0; i < MAX; i++) {
        int t = r.nextInt(100);
        pw.println(t);
        if(t > 50) count ++;
    }
    pw.close();
    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.registerQuery("a = load '" + file + "';");
    pigServer.registerQuery("b = filter a by $0 > 50;");
    pigServer.registerQuery("c = foreach b generate $0 - 50;");
    ExecJob job = pigServer.store("c", "output_map_only");
    PigStats pigStats = job.getStatistics();
    
    //counting the no. of bytes in the output file
    //long filesize = cluster.getFileSystem().getFileStatus(new Path("output_map_only")).getLen();
    InputStream is = FileLocalizer.open(FileLocalizer.fullPath(
            "output_map_only", pigServer.getPigContext()), pigServer
            .getPigContext());

    long filesize = 0;
    while(is.read() != -1) filesize++;
    
    is.close();
    
    cluster.getFileSystem().delete(new Path(file), true);
    cluster.getFileSystem().delete(new Path("output_map_only"), true);

    System.out.println("============================================");
    System.out.println("Test case Map Only");
    System.out.println("============================================");

    JobGraph jg = pigStats.getJobGraph();
    Iterator<JobStats> iter = jg.iterator();
    while (iter.hasNext()) {
        MRJobStats js = (MRJobStats) iter.next();                    

        System.out.println("Map input records : " + js.getMapInputRecords());
        assertEquals(MAX, js.getMapInputRecords());
        System.out.println("Map output records : " + js.getMapOutputRecords());
        assertEquals(count, js.getMapOutputRecords());
        assertEquals(0, js.getReduceInputRecords());
        assertEquals(0, js.getReduceOutputRecords());
        System.out.println("Hdfs bytes written : " + js.getHdfsBytesWritten());
        assertEquals(filesize, js.getHdfsBytesWritten());
    }

}
 
Example 14
Source File: Util.java    From spork with Apache License 2.0
public static boolean isMapredExecType(ExecType execType) {
    return execType == ExecType.MAPREDUCE;
}
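A typical use is guarding MapReduce-only assertions in tests, as Example 17 below does with an inline comparison:

// Usage sketch: skip engine-specific checks on Tez/Spark/local.
if (!Util.isMapredExecType(cluster.getExecType())) {
    return;
}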
 
Example 15
Source File: TestCounters.java    From spork with Apache License 2.0
@Test
public void testMultipleMRJobs() throws IOException, ExecException {
    int count = 0;
    PrintWriter pw = new PrintWriter(Util.createInputFile(cluster, file));
    int [] nos = new int[10];
    for(int i = 0; i < 10; i++)
        nos[i] = 0;

    for(int i = 0; i < MAX; i++) {
        int index = r.nextInt(10);
        int value = r.nextInt(100);
        nos[index] += value;
        pw.println(index + "\t" + value);
    }
    pw.close();

    for(int i = 0; i < 10; i++) { 
        if(nos[i] > 0) count ++;
    }

    PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    pigServer.registerQuery("a = load '" + file + "';");
    pigServer.registerQuery("b = order a by $0;");
    pigServer.registerQuery("c = group b by $0;");
    pigServer.registerQuery("d = foreach c generate group, SUM(b.$1);");
    ExecJob job = pigServer.store("d", "output");
    PigStats pigStats = job.getStatistics();
    
    InputStream is = FileLocalizer.open(FileLocalizer.fullPath("output",
            pigServer.getPigContext()), pigServer.getPigContext());
    long filesize = 0;
    while(is.read() != -1) filesize++;
    
    is.close();
    
    cluster.getFileSystem().delete(new Path(file), true);
    cluster.getFileSystem().delete(new Path("output"), true);
    
    System.out.println("============================================");
    System.out.println("Test case MultipleMRJobs");
    System.out.println("============================================");
    
    JobGraph jp = pigStats.getJobGraph();
    MRJobStats js = (MRJobStats)jp.getSinks().get(0);
    
    System.out.println("Job id: " + js.getName());
    System.out.println(jp.toString());
    
    System.out.println("Map input records : " + js.getMapInputRecords());
    assertEquals(MAX, js.getMapInputRecords());
    System.out.println("Map output records : " + js.getMapOutputRecords());
    assertEquals(MAX, js.getMapOutputRecords());
    System.out.println("Reduce input records : " + js.getReduceInputRecords());
    assertEquals(count, js.getReduceInputRecords());
    System.out.println("Reduce output records : " + js.getReduceOutputRecords());
    assertEquals(count, js.getReduceOutputRecords());
    
    System.out.println("Hdfs bytes written : " + js.getHdfsBytesWritten());
    assertEquals(filesize, js.getHdfsBytesWritten());

}
 
Example 16
Source File: TestMergeJoinOuter.java    From spork with Apache License 2.0
@Test
public void testCompilation() {
    try {
        String query = "A = LOAD 'data1' using "+ DummyCollectableLoader.class.getName() +"() as (id, name, grade);" + 
        "B = LOAD 'data2' using "+ DummyIndexableLoader.class.getName() +"() as (id, name, grade);" +
        "C = join A by id left, B by id using 'merge';" +
        "store C into 'out';";
        LogicalPlan lp = Util.buildLp(pigServer, query);
        LOStore store = (LOStore)lp.getSinks().get(0);
        LOJoin join = (LOJoin)lp.getPredecessors(store).get(0);
        assertEquals(LOJoin.JOINTYPE.MERGE, join.getJoinType());

        PigContext pc = new PigContext(ExecType.MAPREDUCE, cluster.getProperties());
        pc.connect();
        PhysicalPlan phyP = Util.buildPp(pigServer, query);
        PhysicalOperator phyOp = phyP.getLeaves().get(0);
        assertTrue(phyOp instanceof POStore);
        phyOp = phyOp.getInputs().get(0);
        assertTrue(phyOp instanceof POForEach);
        assertEquals(1,phyOp.getInputs().size());
        assertTrue(phyOp.getInputs().get(0) instanceof POMergeCogroup);
        
        MROperPlan mrPlan = Util.buildMRPlan(phyP, pc);
        assertEquals(2,mrPlan.size());

        Iterator<MapReduceOper> itr = mrPlan.iterator();
        List<MapReduceOper> opers = new ArrayList<MapReduceOper>();
        opers.add(itr.next());
        opers.add(itr.next());
        //Order of entrySet is not guaranteed with jdk1.7
        Collections.sort(opers);
        
        assertTrue(opers.get(0).reducePlan.isEmpty());
        assertFalse(opers.get(0).mapPlan.isEmpty());
        
        assertFalse(opers.get(1).reducePlan.isEmpty());
        assertFalse(opers.get(1).mapPlan.isEmpty());


    } catch (Exception e) {
        e.printStackTrace();
        fail("Compilation of merged cogroup failed.");
    }

}
 
Example 17
Source File: TestPigServer.java    From spork with Apache License 2.0
@Test
public void testExplainXmlComplex() throws Throwable {
    // TODO: Explain XML output is not supported in non-MR mode. Remove the
    // following condition once it's implemented in Tez.
    if (cluster.getExecType() != ExecType.MAPREDUCE) {
        return;
    }
    PigServer pig = new PigServer(cluster.getExecType(), properties);
    pig.registerQuery("a = load 'a' as (site: chararray, count: int, itemCounts: bag { itemCountsTuple: tuple (type: chararray, typeCount: int, f: float, m: map[]) } ) ;") ;
    pig.registerQuery("b = foreach a generate site, count, FLATTEN(itemCounts);") ;
    pig.registerQuery("c = group b by site;");
    pig.registerQuery("d = foreach c generate FLATTEN($1);");
    pig.registerQuery("e = group d by $2;");

    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    PrintStream ps = new PrintStream(baos);
    pig.explain("e", "xml", true, false, ps, ps, null, null);

    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
    DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
    Document doc = dBuilder.parse(bais);

    //Verify Logical and Physical Plans aren't supported.
    NodeList logicalPlan = doc.getElementsByTagName("logicalPlan");
    assertEquals(1, logicalPlan.getLength());
    assertTrue(logicalPlan.item(0).getTextContent().contains("Not Supported"));
    NodeList physicalPlan = doc.getElementsByTagName("physicalPlan");
    assertEquals(1, physicalPlan.getLength());
    assertTrue(physicalPlan.item(0).getTextContent().contains("Not Supported"));

    //Verify we have two loads and one is temporary
    NodeList loads = doc.getElementsByTagName("POLoad");
    assertEquals(2, loads.getLength());

    boolean sawTempLoad = false;
    boolean sawNonTempLoad = false;
    for (int i = 0; i < loads.getLength(); i++) {
        Boolean isTempLoad = null;
        boolean hasAlias = false;

        Node poLoad = loads.item(i);
        NodeList children = poLoad.getChildNodes();

        for (int j = 0; j < children.getLength(); j++) {
            Node child = children.item(j);
            if (child.getNodeName().equals("alias")) {
                hasAlias = true;
            }
            if (child.getNodeName().equals("isTmpLoad")) {
                if (child.getTextContent().equals("false")) {
                    isTempLoad = false;
                } else if (child.getTextContent().equals("true")) {
                    isTempLoad = true;
                }
            }
        }

        if (isTempLoad == null) {
            fail("POLoad elements should have isTmpLoad child node.");
        } else if (isTempLoad && hasAlias) {
            fail("Temp loads should not have aliases");
        } else if (!isTempLoad && !hasAlias) {
            fail("Non temporary loads should be associated with alias.");
        }

        sawTempLoad = sawTempLoad || isTempLoad;
        sawNonTempLoad = sawNonTempLoad || !isTempLoad;
    }

    assertTrue(sawTempLoad && sawNonTempLoad);
}
 
Example 18
Source File: TestLookupInFiles.java    From spork with Apache License 2.0
@Override
public void setUp() throws Exception{
    pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
}
 
Example 19
Source File: TestAlgebraicInstantiation.java    From spork with Apache License 2.0
public TestAlgebraicInstantiation() throws ExecException {
    pig = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
}
 
Example 20
Source File: PigContext.java    From spork with Apache License 2.0
public PigContext() {
    this(ExecType.MAPREDUCE, new Properties());
}