dachisgroup.com




Dachis Group
Las Vegas 2012




  Pig Unit Testing


     Clint Miller
     Pigout Hackday, Austin TX
     May 11, 2012
® 2011 Dachis Group.
dachisgroup.com




What is PigUnit?

  • Not really a *Unit framework.
  • Library that you can use within your JUnit tests that allows you to
                 • Run your Pig scripts from within your JUnit tests.
                 • Override variables in your Pig scripts so that they get values from your JUnit
                   tests rather than reading external sources such as HDFS.
                 • Inspect the values of your Pig script variables.
                 • Make your STORE statements into no-ops so that your Pig scripts run
                   without side effects.




® 2011 Dachis Group.
dachisgroup.com




Simple Pig Script

  -- Computes minutes played per goal for every player with at least $MIN_GOALS goals.
  -- $MIN_GOALS is a script parameter that must be supplied by the caller.

  -- One row per player: name, team, minutes played, goals scored.
  minutes_and_goals = LOAD 'minutes_and_goals' USING BinStorage() AS (
              name: chararray,
              team: chararray,
              minutes: long,
              goals: long
            );

  -- Keep only players who reached the goal threshold.
  top_goal_scorers = FILTER minutes_and_goals BY goals >= $MIN_GOALS;

  -- minutes and goals are both declared long, so minutes/goals is a long/long division.
  minutes_per_goal_unsorted = FOREACH top_goal_scorers
                 GENERATE name, minutes/goals AS minutes_per_goal;

  -- Default ORDER direction: fewest minutes per goal first.
  minutes_per_goal = ORDER minutes_per_goal_unsorted BY minutes_per_goal;

  STORE minutes_per_goal INTO 'minutes_per_goal' USING BinStorage();




® 2011 Dachis Group.
dachisgroup.com




Simple Test Program

     /**
      * Runs sampleScript.pig with MIN_GOALS=20, feeding the {@code minutes_and_goals}
      * alias from in-memory rows and checking the {@code minutes_per_goal} alias.
      *
      * @throws Exception if the script cannot be run or the output does not match
      */
     public void testSamplePigScript() throws Exception {
         String[] args = {
             "MIN_GOALS=20"
         };

         PigTest test = new PigTest("/Users/clintmiller/blah/sampleScript.pig", args);

         // One row per player; the four columns (name, team, minutes, goals) are
         // separated by tab characters so that each row maps onto the script's
         // four-field schema.
         String[] input = {
             "Benzema\tReal Madrid\t2165\t20",
             "Ronaldo\tReal Madrid\t3264\t45",
             "Falcao\tAtletico Madrid\t2852\t23",
             "Messi\tBarcelona\t3177\t50",
             "Xavi\tBarcelona\t2079\t10",
             "Higuain\tReal Madrid\t1641\t22",
             "Sanchez\tBarcelona\t1678\t12"
         };

         // Players with fewer than 20 goals (Xavi, Sanchez) are filtered out; the
         // remaining rows are ordered by minutes/goals, ascending.
         String[] expectedOutput = {
             "(Messi,63)",
             "(Ronaldo,72)",
             "(Higuain,74)",
             "(Benzema,108)",
             "(Falcao,124)"
         };

         test.assertOutput("minutes_and_goals", input, "minutes_per_goal", expectedOutput);
     }

® 2011 Dachis Group.
dachisgroup.com




More Complex Pig Script
(reads two input files)


  -- Computes each player's share of their team's goals, as a whole-number percentage.
  -- Reads two inputs: per-player stats and per-team goal totals.

  -- One row per player: name, team, minutes played, goals scored.
  players = LOAD 'minutes_and_goals' USING BinStorage() AS (
          name: chararray,
          team: chararray,
          minutes: long,
          goals: long
        );

  -- One row per team: team name and total goals.
  teams = LOAD 'team_goals' USING BinStorage() AS (
        name: chararray,
        goals: long
      );

  -- Match each player to their team's row by team name.
  player_and_team = JOIN players BY team, teams BY name;

  -- After the JOIN, fields are disambiguated with the alias:: prefix.
  -- Both goal columns are declared long, so the percentage is a long/long division.
  percent_of_team_goals_unsorted = FOREACH player_and_team
                    GENERATE players::name, teams::name,
                         (players::goals * 100) / teams::goals
                         AS percent_of_team_goals;

  -- Highest percentage first; ties are ordered by team name.
  percent_of_team_goals = ORDER percent_of_team_goals_unsorted
                BY percent_of_team_goals DESC, teams::name;

  STORE percent_of_team_goals INTO 'percent_of_team_goals' USING BinStorage();




® 2011 Dachis Group.
dachisgroup.com




Methods on PigTest

  Iterator<Tuple> getAlias(String alias);

  Iterator<Tuple> getAlias(); // Fetches value of last variable used in a STORE command

  void override(String alias, String query);

  void unoverride(String alias);

  void assertOutput(String[] expected);

  void assertOutput(String alias, String[] expected);

  void assertOutput(File expected);

  void assertOutput(String alias, File expected);

  void assertOutput(String aliasInput, String[] input, String alias, String[] expected);



                            There is no simple way to override the
                            values of multiple input variables!




® 2011 Dachis Group.
dachisgroup.com




Method override() Saves the Day!
   /**
    * Test helper that replaces the input of Pig aliases with rows supplied by a test.
    *
    * <p>For every mocked alias, the rows are handed to the {@code Cluster} to be
    * written to a file named {@code <alias>-pigunit-input-overridden.txt}, and the
    * alias is overridden on the {@code PigTest} with a LOAD of that file using the
    * schema reported by the {@code PigServer}. {@link #cleanup()} deletes every
    * file recorded this way.
    */
   public class InputMocker {
       protected PigTest test;
       protected PigServer pigServer;
       protected Cluster cluster;
       protected List<String> overrideFiles;

       /**
        * Creates a mocker bound to the given test, server and cluster.
        *
        * @param test      the PigTest whose aliases will be overridden
        * @param pigServer the server asked for each alias's schema
        * @param cluster   the cluster used to write and delete the override files
        */
       public InputMocker(PigTest test, PigServer pigServer, Cluster cluster) {
           this.overrideFiles = new ArrayList<String>();
           this.cluster = cluster;
           this.pigServer = pigServer;
           this.test = test;
       }

       /**
        * Overrides {@code alias} so that it loads {@code input} instead of its
        * original source.
        *
        * @param alias the Pig alias to override
        * @param input the rows passed to the cluster as the replacement input
        * @throws Exception if running the script, copying the rows or reading the
        *                   schema fails
        */
       public void mockInputAlias(String alias, String[] input) throws Exception {
           // The script is run before the alias's schema is requested.
           test.runScript();

           StringBuilder schemaText = new StringBuilder();
           Schema aliasSchema = pigServer.dumpSchema(alias);
           Schema.stringifySchema(schemaText, aliasSchema, DataType.TUPLE);

           // Record the file name before copying so cleanup() knows about it.
           String overrideFile = alias + "-pigunit-input-overridden.txt";
           overrideFiles.add(overrideFile);
           cluster.copyFromLocalFile(input, overrideFile, true);

           String loadStatement =
               String.format("%s = LOAD '%s' AS %s;", alias, overrideFile, schemaText.toString());
           test.override(alias, loadStatement);
       }

       /**
        * Deletes every override file recorded by {@link #mockInputAlias}.
        *
        * @throws Exception if a delete fails
        */
       public void cleanup() throws Exception {
           for (int i = 0; i < overrideFiles.size(); i++) {
               Path recorded = new Path(overrideFiles.get(i));
               cluster.delete(recorded);
           }
       }
   }


® 2011 Dachis Group.
dachisgroup.com




Allows You to Rewrite Pig Script
  -- Original LOAD statements, as written in the script (before any override).

  -- One row per player: name, team, minutes played, goals scored.
  players = LOAD 'minutes_and_goals' USING BinStorage() AS (
          name: chararray,
          team: chararray,
          minutes: long,
          goals: long
        );

  -- One row per team: team name and total goals.
  teams = LOAD 'team_goals' USING BinStorage() AS (
        name: chararray,
        goals: long
      );




                                                      Test input data written to temp files and Pig script rewritten
                                                      to read those files.


                                       -- LOAD statements after the override: same schemas, but each alias
                                       -- now reads its "<alias>-pigunit-input-overridden.txt" file.
                                       players = LOAD 'players-pigunit-input-overridden.txt' AS (
                                               name: chararray,
                                               team: chararray,
                                               minutes: long,
                                               goals: long
                                             );

                                       teams = LOAD 'teams-pigunit-input-overridden.txt' AS (
                                             name: chararray,
                                             goals: long
                                           );


® 2011 Dachis Group.
dachisgroup.com




Test Program - Initialization
     // Test for sampleScript2.pig; this first part only sets up the fixtures.
     public void testSamplePigScript2() throws Exception {
       // Local-mode PigServer, and a Cluster built from that server's PigContext.
       PigServer pigServer = new PigServer(ExecType.LOCAL);
       Cluster cluster = new Cluster(pigServer.getPigContext());

        // No script parameters are passed for this script.
        String[] args = new String[] {};

        // The same pigServer and cluster are handed to both PigTest and InputMocker.
        PigTest test = new PigTest
          ("/Users/clintmiller/blah/sampleScript2.pig",
           args, pigServer, cluster);

        InputMocker mocker = new InputMocker(test, pigServer, cluster);




® 2011 Dachis Group.
dachisgroup.com




Test Program – Overriding Inputs
       // One row per player; columns (name, team, minutes, goals) are tab-separated.
       String[] players = {
          "Benzema\tReal Madrid\t2165\t20",
          "Ronaldo\tReal Madrid\t3264\t45",
          "Falcao\tAtletico Madrid\t2852\t23",
          "Messi\tBarcelona\t3177\t50",
          "Xavi\tBarcelona\t2079\t10",
          "Higuain\tReal Madrid\t1641\t22",
          "Sanchez\tBarcelona\t1678\t12"
       };

       // One row per team; columns (name, goals) are tab-separated.
       String[] teams = {
          "Barcelona\t112",
          "Real Madrid\t117",
          "Atletico Madrid\t52"
       };

       // Override both input aliases of the script with the rows above.
       mocker.mockInputAlias("players", players);
       mocker.mockInputAlias("teams", teams);




® 2011 Dachis Group.
dachisgroup.com




Test Program – Testing Results
         // Expected (player, team, percent) rows for the percent_of_team_goals alias,
         // listed in the order checked by assertOutput below.
         String[] percentOfTeamGoals = {
            "(Falcao,Atletico Madrid,44)",
            "(Messi,Barcelona,44)",
            "(Ronaldo,Real Madrid,38)",
            "(Higuain,Real Madrid,18)",
            "(Benzema,Real Madrid,17)",
            "(Sanchez,Barcelona,10)",
            "(Xavi,Barcelona,8)"
         };

         test.assertOutput("percent_of_team_goals", percentOfTeamGoals);

         // NOTE(review): cleanup() is not reached if assertOutput throws; consider
         // try/finally so the override files are always deleted.
         mocker.cleanup();
     }




® 2011 Dachis Group.

Unit testing pig

  • 1.
    dachisgroup.com Dachis Group Las Vegas 2012 Pig Unit Testing Clint Miller Pigout Hackday, Austin TX May 11, 2012 ® 2011 Dachis Group.
  • 2.
    dachisgroup.com What is PigUnit? • Not really a *Unit framework. • Library that you can use within your JUnit tests that allows you to • Run your Pig scripts from within your JUnit tests. • Override variables in your Pig scripts so that they get values from your JUnit tests rather than reading external sources such as HDFS. • Inspect the values of your Pig script variables. • Make your STORE statements into no-ops so that your Pig scripts run without side effects. ® 2011 Dachis Group.
  • 3.
    dachisgroup.com Simple Pig Script minutes_and_goals = LOAD 'minutes_and_goals' USING BinStorage() AS ( name: chararray, team: chararray, minutes: long, goals: long ); top_goal_scorers = FILTER minutes_and_goals BY goals >= $MIN_GOALS; minutes_per_goal_unsorted = FOREACH top_goal_scorers GENERATE name, minutes/goals AS minutes_per_goal; minutes_per_goal = ORDER minutes_per_goal_unsorted BY minutes_per_goal; STORE minutes_per_goal INTO 'minutes_per_goal' USING BinStorage(); ® 2011 Dachis Group.
  • 4.
    dachisgroup.com Simple Test Program public void testSamplePigScript() throws Exception { String[] args = { "MIN_GOALS=20" }; PigTest test = new PigTest("/Users/clintmiller/blah/sampleScript.pig", args); String[] input = { "Benzema\tReal Madrid\t2165\t20", "Ronaldo\tReal Madrid\t3264\t45", "Falcao\tAtletico Madrid\t2852\t23", "Messi\tBarcelona\t3177\t50", "Xavi\tBarcelona\t2079\t10", "Higuain\tReal Madrid\t1641\t22", "Sanchez\tBarcelona\t1678\t12" }; String[] expectedOutput = { "(Messi,63)", "(Ronaldo,72)", "(Higuain,74)", "(Benzema,108)", "(Falcao,124)" }; test.assertOutput("minutes_and_goals", input, "minutes_per_goal", expectedOutput); } ® 2011 Dachis Group.
  • 5.
    dachisgroup.com More Complex Pig Script (reads two input files) players = LOAD 'minutes_and_goals' USING BinStorage() AS ( name: chararray, team: chararray, minutes: long, goals: long ); teams = LOAD 'team_goals' USING BinStorage() AS ( name: chararray, goals: long ); player_and_team = JOIN players BY team, teams BY name; percent_of_team_goals_unsorted = FOREACH player_and_team GENERATE players::name, teams::name, (players::goals * 100) / teams::goals AS percent_of_team_goals; percent_of_team_goals = ORDER percent_of_team_goals_unsorted BY percent_of_team_goals DESC, teams::name; STORE percent_of_team_goals INTO 'percent_of_team_goals' USING BinStorage(); ® 2011 Dachis Group.
  • 6.
    dachisgroup.com Methods on PigTest Iterator<Tuple> getAlias(String alias); Iterator<Tuple> getAlias(); // Fetches value of last variable used in a STORE command void override(String alias, String query); void unoverride(String alias); void assertOutput(String[] expected); void assertOutput(String alias, String[] expected); void assertOutput(File expected); void assertOutput(String alias, File expected); void assertOutput(String aliasInput, String[] input, String alias, String[] expected); There is no simple way to override the values of multiple input variables! ® 2011 Dachis Group.
  • 7.
    dachisgroup.com Method override() Saves the Day! public class InputMocker { protected PigTest test; protected PigServer pigServer; protected Cluster cluster; protected List<String> overrideFiles; public InputMocker(PigTest test, PigServer pigServer, Cluster cluster) { this.test = test; this.pigServer = pigServer; this.cluster = cluster; this.overrideFiles = new ArrayList<String>(); } public void mockInputAlias(String alias, String[] input) throws Exception { test.runScript(); StringBuilder sb = new StringBuilder(); Schema.stringifySchema(sb, pigServer.dumpSchema(alias), DataType.TUPLE); String destination = alias + "-pigunit-input-overridden.txt"; overrideFiles.add(destination); cluster.copyFromLocalFile(input, destination, true); test.override(alias, String.format("%s = LOAD '%s' AS %s;", alias, destination, sb.toString())); } public void cleanup() throws Exception { for (String overrideFile: overrideFiles) { cluster.delete(new Path(overrideFile)); } } } ® 2011 Dachis Group.
  • 8.
    dachisgroup.com Allows You to Rewrite Pig Script players = LOAD 'minutes_and_goals' USING BinStorage() AS ( name: chararray, team: chararray, minutes: long, goals: long ); teams = LOAD 'team_goals' USING BinStorage() AS ( name: chararray, goals: long ); Test input data written to temp files and Pig script rewritten to read those files. players = LOAD 'players-pigunit-input-overridden.txt' AS ( name: chararray, team: chararray, minutes: long, goals: long ); teams = LOAD 'teams-pigunit-input-overridden.txt' AS ( name: chararray, goals: long ); ® 2011 Dachis Group.
  • 9.
    dachisgroup.com Test Program - Initialization public void testSamplePigScript2() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Cluster cluster = new Cluster(pigServer.getPigContext()); String[] args = new String[] {}; PigTest test = new PigTest ("/Users/clintmiller/blah/sampleScript2.pig", args, pigServer, cluster); InputMocker mocker = new InputMocker(test, pigServer, cluster); ® 2011 Dachis Group.
  • 10.
    dachisgroup.com Test Program – Overriding Inputs String[] players = { "Benzema\tReal Madrid\t2165\t20", "Ronaldo\tReal Madrid\t3264\t45", "Falcao\tAtletico Madrid\t2852\t23", "Messi\tBarcelona\t3177\t50", "Xavi\tBarcelona\t2079\t10", "Higuain\tReal Madrid\t1641\t22", "Sanchez\tBarcelona\t1678\t12" }; String[] teams = { "Barcelona\t112", "Real Madrid\t117", "Atletico Madrid\t52" }; mocker.mockInputAlias("players", players); mocker.mockInputAlias("teams", teams); ® 2011 Dachis Group.
  • 11.
    dachisgroup.com Test Program – Testing Results String[] percentOfTeamGoals = { "(Falcao,Atletico Madrid,44)", "(Messi,Barcelona,44)", "(Ronaldo,Real Madrid,38)", "(Higuain,Real Madrid,18)", "(Benzema,Real Madrid,17)", "(Sanchez,Barcelona,10)", "(Xavi,Barcelona,8)" }; test.assertOutput("percent_of_team_goals", percentOfTeamGoals); mocker.cleanup(); } ® 2011 Dachis Group.