Ejemplo n.º 1
0
        /// <summary>Launches the distributed random-write job and waits for it to finish.</summary>
        /// <remarks>
        /// The number of map tasks is derived from the configuration: by default 10 maps per
        /// task tracker, each writing 1 GiB, unless the total-bytes setting overrides the total.
        /// The job is map-only: the number of reduce tasks is set to zero.
        /// </remarks>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the output directory.</param>
        /// <returns>
        /// 0 if the job succeeded, 1 if it failed, 2 if no output directory was supplied,
        /// and -2 if the bytes-per-map setting is 0.
        /// </returns>
        /// <exception cref="System.IO.IOException"></exception>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            if (args.Length == 0)
            {
                System.Console.Out.WriteLine("Usage: writer <out-dir>");
                ToolRunner.PrintGenericCommandUsage(System.Console.Out);
                return 2;
            }

            Path outDir = new Path(args[0]);
            Configuration conf = GetConf();
            JobClient client = new JobClient(conf);
            ClusterStatus cluster = client.GetClusterStatus();
            int numMapsPerHost = conf.GetInt(MapsPerHost, 10);
            long numBytesToWritePerMap = conf.GetLong(BytesPerMap, 1 * 1024 * 1024 * 1024);

            // Guard the division below.
            if (numBytesToWritePerMap == 0)
            {
                System.Console.Error.WriteLine("Cannot have " + BytesPerMap + " set to 0");
                return -2;
            }

            long totalBytesToWrite = conf.GetLong(
                TotalBytes,
                numMapsPerHost * numBytesToWritePerMap * cluster.GetTaskTrackers());
            int numMaps = (int)(totalBytesToWrite / numBytesToWritePerMap);

            // Less than one map's worth of data requested: run a single map that writes all of it.
            if (numMaps == 0 && totalBytesToWrite > 0)
            {
                numMaps = 1;
                conf.SetLong(BytesPerMap, totalBytesToWrite);
            }
            conf.SetInt(MRJobConfig.NumMaps, numMaps);

            Job job = Job.GetInstance(conf);
            job.SetJarByClass(typeof(RandomWriter));
            job.SetJobName("random-writer");
            FileOutputFormat.SetOutputPath(job, outDir);
            job.SetOutputKeyClass(typeof(BytesWritable));
            job.SetOutputValueClass(typeof(BytesWritable));
            job.SetInputFormatClass(typeof(RandomWriter.RandomInputFormat));
            job.SetMapperClass(typeof(RandomWriter.RandomMapper));
            job.SetReducerClass(typeof(Reducer));
            job.SetOutputFormatClass(typeof(SequenceFileOutputFormat));
            System.Console.Out.WriteLine("Running " + numMaps + " maps.");
            // reducer NONE
            job.SetNumReduceTasks(0);

            // DateTime.Now rather than "new DateTime()": the parameterless constructor yields
            // DateTime.MinValue, not the current time. Elapsed time comes from a Stopwatch so
            // it is unaffected by wall-clock adjustments.
            DateTime startTime = DateTime.Now;
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            System.Console.Out.WriteLine("Job started: " + startTime);

            int ret = job.WaitForCompletion(true) ? 0 : 1;

            timer.Stop();
            DateTime endTime = DateTime.Now;
            System.Console.Out.WriteLine("Job ended: " + endTime);
            System.Console.Out.WriteLine("The job took " + (timer.ElapsedMilliseconds / 1000) + " seconds.");
            return ret;
        }
Ejemplo n.º 2
0
        /// <summary>Launches the distributed random-text-write job and waits for it to finish.</summary>
        /// <remarks>
        /// The number of map tasks is derived from the configuration: by default 10 maps per
        /// task tracker, each writing 1 GiB, unless the total-bytes setting overrides the total.
        /// The job is map-only: the number of reduce tasks is set to zero.
        /// Recognised option: <c>-outFormat &lt;class&gt;</c> selects the output format class
        /// (default <c>SequenceFileOutputFormat</c>). The first remaining argument is the
        /// output directory.
        /// </remarks>
        /// <param name="args">Command-line arguments: optional <c>-outFormat</c> plus the output directory.</param>
        /// <returns>
        /// 0 if the job succeeded, 1 if it failed, -2 if the bytes-per-map setting is 0,
        /// or the value returned by <c>PrintUsage()</c> when the arguments are invalid.
        /// </returns>
        /// <exception cref="System.IO.IOException"></exception>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            if (args.Length == 0)
            {
                return PrintUsage();
            }

            Configuration conf = GetConf();
            JobClient client = new JobClient(conf);
            ClusterStatus cluster = client.GetClusterStatus();
            int numMapsPerHost = conf.GetInt(MapsPerHost, 10);
            long numBytesToWritePerMap = conf.GetLong(BytesPerMap, 1 * 1024 * 1024 * 1024);

            // Guard the division below.
            if (numBytesToWritePerMap == 0)
            {
                System.Console.Error.WriteLine("Cannot have " + BytesPerMap + " set to 0");
                return -2;
            }

            long totalBytesToWrite = conf.GetLong(
                TotalBytes,
                numMapsPerHost * numBytesToWritePerMap * cluster.GetTaskTrackers());
            int numMaps = (int)(totalBytesToWrite / numBytesToWritePerMap);

            // Less than one map's worth of data requested: run a single map that writes all of it.
            if (numMaps == 0 && totalBytesToWrite > 0)
            {
                numMaps = 1;
                conf.SetLong(BytesPerMap, totalBytesToWrite);
            }
            conf.SetInt(MRJobConfig.NumMaps, numMaps);

            Job job = Job.GetInstance(conf);
            job.SetJarByClass(typeof(RandomTextWriter));
            job.SetJobName("random-text-writer");
            job.SetOutputKeyClass(typeof(Org.Apache.Hadoop.IO.Text));
            job.SetOutputValueClass(typeof(Org.Apache.Hadoop.IO.Text));
            job.SetInputFormatClass(typeof(RandomWriter.RandomInputFormat));
            job.SetMapperClass(typeof(RandomTextWriter.RandomTextMapper));

            Type outputFormatClass = typeof(SequenceFileOutputFormat);
            IList<string> otherArgs = new AList<string>();

            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    if ("-outFormat".Equals(args[i]))
                    {
                        outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
                    }
                    else
                    {
                        otherArgs.AddItem(args[i]);
                    }
                }
                catch (IndexOutOfRangeException)
                {
                    // i was already advanced past the end, so args[i - 1] is the option
                    // that is missing its value.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
                    return PrintUsage();
                }
            }

            // Only options were given (e.g. "-outFormat X"): there is no output directory to use.
            if (otherArgs.Count == 0)
            {
                System.Console.Out.WriteLine("ERROR: Output directory not specified");
                return PrintUsage();
            }

            job.SetOutputFormatClass(outputFormatClass);
            FileOutputFormat.SetOutputPath(job, new Path(otherArgs[0]));
            System.Console.Out.WriteLine("Running " + numMaps + " maps.");
            // reducer NONE
            job.SetNumReduceTasks(0);

            // DateTime.Now rather than "new DateTime()": the parameterless constructor yields
            // DateTime.MinValue, not the current time. Elapsed time comes from a Stopwatch so
            // it is unaffected by wall-clock adjustments.
            DateTime startTime = DateTime.Now;
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            System.Console.Out.WriteLine("Job started: " + startTime);

            int ret = job.WaitForCompletion(true) ? 0 : 1;

            timer.Stop();
            DateTime endTime = DateTime.Now;
            System.Console.Out.WriteLine("Job ended: " + endTime);
            System.Console.Out.WriteLine("The job took " + (timer.ElapsedMilliseconds / 1000) + " seconds.");
            return ret;
        }
Ejemplo n.º 3
0
        /// <summary>The main driver for the join program.</summary>
        /// <remarks>
        /// Parses the command line, configures a job named "join" whose input is a
        /// <c>CompositeInputFormat</c> join expression over the input paths, submits it and
        /// waits for completion.
        /// Recognised options: <c>-r</c> (number of reduces), <c>-inFormat</c>, <c>-outFormat</c>,
        /// <c>-outKey</c>, <c>-outValue</c> (class names) and <c>-joinOp</c> (join operation,
        /// default "inner"). Of the remaining arguments the last is the output directory and
        /// all preceding ones are input paths; at least two are required.
        /// The default number of reduces is 90% of the cluster's maximum reduce tasks, or
        /// task trackers times the reduces-per-host setting when that setting is present.
        /// </remarks>
        /// <param name="args">Command-line arguments as described above.</param>
        /// <returns>
        /// 0 if the job succeeded, 1 if it failed, or the value returned by
        /// <c>PrintUsage()</c> when the arguments are invalid.
        /// </returns>
        /// <exception cref="System.IO.IOException">
        /// When there is communication problems with the
        /// job tracker.
        /// </exception>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Configuration conf = GetConf();
            JobClient client = new JobClient(conf);
            ClusterStatus cluster = client.GetClusterStatus();
            int numReduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
            string joinReduces = conf.Get(ReducesPerHost);

            if (joinReduces != null)
            {
                numReduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(joinReduces);
            }

            Job job = Job.GetInstance(conf);
            job.SetJobName("join");
            // NOTE(review): the jar is located via Sort rather than this driver's own class — confirm intended.
            job.SetJarByClass(typeof(Sort));
            job.SetMapperClass(typeof(Mapper));
            job.SetReducerClass(typeof(Reducer));

            Type inputFormatClass = typeof(SequenceFileInputFormat);
            Type outputFormatClass = typeof(SequenceFileOutputFormat);
            Type outputKeyClass = typeof(BytesWritable);
            Type outputValueClass = typeof(TupleWritable);
            string op = "inner";
            IList<string> otherArgs = new AList<string>();

            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    switch (args[i])
                    {
                        case "-r":
                            numReduces = System.Convert.ToInt32(args[++i]);
                            break;

                        case "-inFormat":
                            inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>();
                            break;

                        case "-outFormat":
                            outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
                            break;

                        case "-outKey":
                            outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>();
                            break;

                        case "-outValue":
                            outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<Writable>();
                            break;

                        case "-joinOp":
                            op = args[++i];
                            break;

                        default:
                            otherArgs.AddItem(args[i]);
                            break;
                    }
                }
                catch (FormatException)
                {
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return PrintUsage();
                }
                catch (OverflowException)
                {
                    // Convert.ToInt32 reports out-of-range numbers with OverflowException
                    // rather than FormatException; treat both as a bad integer argument.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return PrintUsage();
                }
                catch (IndexOutOfRangeException)
                {
                    // i was already advanced past the end, so args[i - 1] is the option
                    // that is missing its value.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
                    return PrintUsage();
                }
            }

            // Set user-supplied (possibly default) job configs
            job.SetNumReduceTasks(numReduces);
            if (otherArgs.Count < 2)
            {
                System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " + otherArgs.Count
                                             + " instead of at least 2.");
                return PrintUsage();
            }

            // The last positional argument is the output directory; everything before it is input.
            int lastIndex = otherArgs.Count - 1;
            string outputDir = otherArgs[lastIndex];
            otherArgs.RemoveAt(lastIndex);
            FileOutputFormat.SetOutputPath(job, new Path(outputDir));

            IList<Path> plist = new AList<Path>(otherArgs.Count);
            foreach (string s in otherArgs)
            {
                plist.AddItem(new Path(s));
            }

            job.SetInputFormatClass(typeof(CompositeInputFormat));
            job.GetConfiguration().Set(
                CompositeInputFormat.JoinExpr,
                CompositeInputFormat.Compose(op, inputFormatClass, Sharpen.Collections.ToArray(plist, new Path[0])));
            job.SetOutputFormatClass(outputFormatClass);
            job.SetOutputKeyClass(outputKeyClass);
            job.SetOutputValueClass(outputValueClass);

            // DateTime.Now rather than "new DateTime()": the parameterless constructor yields
            // DateTime.MinValue, not the current time. Elapsed time comes from a Stopwatch so
            // it is unaffected by wall-clock adjustments.
            DateTime startTime = DateTime.Now;
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            System.Console.Out.WriteLine("Job started: " + startTime);

            int ret = job.WaitForCompletion(true) ? 0 : 1;

            timer.Stop();
            DateTime endTime = DateTime.Now;
            System.Console.Out.WriteLine("Job ended: " + endTime);
            System.Console.Out.WriteLine("The job took " + (timer.ElapsedMilliseconds / 1000) + " seconds.");
            return ret;
        }
Ejemplo n.º 4
0
        /// <summary>The main driver for the sort program.</summary>
        /// <remarks>
        /// Parses the command line, configures a job named "sorter", submits it and waits for
        /// completion.
        /// Recognised options: <c>-r</c> (number of reduces), <c>-inFormat</c>, <c>-outFormat</c>,
        /// <c>-outKey</c>, <c>-outValue</c> (class names) and
        /// <c>-totalOrder &lt;pcnt&gt; &lt;numSamples&gt; &lt;maxSplits&gt;</c>, which enables a
        /// random-sampled total-order partitioning (a non-positive <c>maxSplits</c> means no limit).
        /// Exactly two positional arguments must remain: the input path and the output directory.
        /// The default number of reduces is 90% of the cluster's maximum reduce tasks, or
        /// task trackers times the reduces-per-host setting when that setting is present.
        /// </remarks>
        /// <param name="args">Command-line arguments as described above.</param>
        /// <returns>
        /// 0 if the job succeeded, 1 if it failed, or the value returned by
        /// <c>PrintUsage()</c> when the arguments are invalid.
        /// </returns>
        /// <exception cref="System.IO.IOException">
        /// When there is communication problems with the
        /// job tracker.
        /// </exception>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Configuration conf = GetConf();
            JobClient client = new JobClient(conf);
            ClusterStatus cluster = client.GetClusterStatus();
            int numReduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
            string sortReduces = conf.Get(ReducesPerHost);

            if (sortReduces != null)
            {
                numReduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(sortReduces);
            }

            Type inputFormatClass = typeof(SequenceFileInputFormat);
            Type outputFormatClass = typeof(SequenceFileOutputFormat);
            Type outputKeyClass = typeof(BytesWritable);
            Type outputValueClass = typeof(BytesWritable);
            IList<string> otherArgs = new AList<string>();
            InputSampler.Sampler<K, V> sampler = null;

            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    switch (args[i])
                    {
                        case "-r":
                            numReduces = System.Convert.ToInt32(args[++i]);
                            break;

                        case "-inFormat":
                            inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>();
                            break;

                        case "-outFormat":
                            outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
                            break;

                        case "-outKey":
                            outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>();
                            break;

                        case "-outValue":
                            outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<Writable>();
                            break;

                        case "-totalOrder":
                        {
                            // Parse with the invariant culture so "0.1" is read the same way
                            // regardless of the machine's regional settings.
                            double pcnt = double.Parse(
                                args[++i], System.Globalization.CultureInfo.InvariantCulture);
                            int numSamples = System.Convert.ToInt32(args[++i]);
                            int maxSplits = System.Convert.ToInt32(args[++i]);
                            if (0 >= maxSplits)
                            {
                                maxSplits = int.MaxValue;
                            }
                            sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
                            break;
                        }

                        default:
                            otherArgs.AddItem(args[i]);
                            break;
                    }
                }
                catch (FormatException)
                {
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return PrintUsage();
                }
                catch (OverflowException)
                {
                    // Convert.ToInt32 / double.Parse report out-of-range numbers with
                    // OverflowException rather than FormatException; treat both alike.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return PrintUsage();
                }
                catch (IndexOutOfRangeException)
                {
                    // i was already advanced past the end, so args[i - 1] is the last
                    // argument that was present.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
                    return PrintUsage();
                }
            }

            // Set user-supplied (possibly default) job configs.
            // "job" is not declared in this method; it is assigned here, not introduced as a local.
            job = Job.GetInstance(conf);
            job.SetJobName("sorter");
            job.SetJarByClass(typeof(Sort));
            job.SetMapperClass(typeof(Mapper));
            job.SetReducerClass(typeof(Reducer));
            job.SetNumReduceTasks(numReduces);
            job.SetInputFormatClass(inputFormatClass);
            job.SetOutputFormatClass(outputFormatClass);
            job.SetOutputKeyClass(outputKeyClass);
            job.SetOutputValueClass(outputValueClass);

            // Make sure there are exactly 2 parameters left.
            if (otherArgs.Count != 2)
            {
                System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " + otherArgs.Count
                                             + " instead of 2.");
                return PrintUsage();
            }
            FileInputFormat.SetInputPaths(job, otherArgs[0]);
            FileOutputFormat.SetOutputPath(job, new Path(otherArgs[1]));

            if (sampler != null)
            {
                System.Console.Out.WriteLine("Sampling input to effect total-order sort...");
                job.SetPartitionerClass(typeof(TotalOrderPartitioner));
                Path inputDir = FileInputFormat.GetInputPaths(job)[0];
                inputDir = inputDir.MakeQualified(inputDir.GetFileSystem(conf));
                Path partitionFile = new Path(inputDir, "_sortPartitioning");
                // NOTE(review): the partition file and cache entry are registered on conf, while
                // the job was created from conf earlier — confirm the job observes these settings.
                TotalOrderPartitioner.SetPartitionFile(conf, partitionFile);
                InputSampler.WritePartitionFile<K, V>(job, sampler);
                URI partitionUri = new URI(partitionFile.ToString() + "#" + "_sortPartitioning");
                DistributedCache.AddCacheFile(partitionUri, conf);
            }

            System.Console.Out.WriteLine("Running on " + cluster.GetTaskTrackers() + " nodes to sort from "
                                         + FileInputFormat.GetInputPaths(job)[0] + " into "
                                         + FileOutputFormat.GetOutputPath(job) + " with " + numReduces + " reduces.");

            // DateTime.Now rather than "new DateTime()": the parameterless constructor yields
            // DateTime.MinValue, not the current time. Elapsed time comes from a Stopwatch so
            // it is unaffected by wall-clock adjustments.
            DateTime startTime = DateTime.Now;
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            System.Console.Out.WriteLine("Job started: " + startTime);

            int ret = job.WaitForCompletion(true) ? 0 : 1;

            timer.Stop();
            DateTime endTime = DateTime.Now;
            System.Console.Out.WriteLine("Job ended: " + endTime);
            System.Console.Out.WriteLine("The job took " + (timer.ElapsedMilliseconds / 1000) + " seconds.");
            return ret;
        }