示例#1
0
        /// <summary>Write a partition file for the given job, using the Sampler provided.</summary>
        /// <remarks>
        /// Queries the sampler for a sample keyset, sorts it with the job's sort
        /// comparator, selects one split key per reducer boundary
        /// (<c>numReduceTasks - 1</c> keys in total), and writes them to the
        /// destination returned from
        /// <see cref="TotalOrderPartitioner{K, V}.GetPartitionFile(Org.Apache.Hadoop.Conf.Configuration)"/>.
        /// A file already present at that destination is deleted first.
        /// </remarks>
        /// <param name="job">Job supplying the configuration, input format, sort comparator,
        /// map output key class and number of reduce tasks.</param>
        /// <param name="sampler">Sampler that produces the candidate keys.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.Exception"/>
        public static void WritePartitionFile <K, V>(Job job, InputSampler.Sampler <K, V> sampler
                                                     )
        {
            Configuration conf          = job.GetConfiguration();
            InputFormat   inf           = ReflectionUtils.NewInstance(job.GetInputFormatClass(), conf);
            int           numPartitions = job.GetNumReduceTasks();

            K[] samples = (K[])sampler.GetSample(inf, job);
            Log.Info("Using " + samples.Length + " samples");
            RawComparator <K> comparator = (RawComparator <K>)job.GetSortComparator();

            Arrays.Sort(samples, comparator);
            Path       dst = new Path(TotalOrderPartitioner.GetPartitionFile(conf));
            FileSystem fs  = dst.GetFileSystem(conf);

            if (fs.Exists(dst))
            {
                fs.Delete(dst, false);
            }
            SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, dst, job.GetMapOutputKeyClass
                                                                       (), typeof(NullWritable));
            try
            {
                NullWritable nullValue = NullWritable.Get();
                float        stepSize  = samples.Length / (float)numPartitions;
                int          last      = -1;

                for (int i = 1; i < numPartitions; ++i)
                {
                    // Round half up, as Java's Math.round does. .NET's default is
                    // banker's rounding and Math.Round returns double, so both the
                    // midpoint mode and the cast have to be explicit.
                    int k = (int)System.Math.Round(stepSize * i, System.MidpointRounding.AwayFromZero);

                    // Skip forward past keys equal to the previously written one so the
                    // same split point is not emitted twice.
                    while (last >= k && comparator.Compare(samples[last], samples[k]) == 0)
                    {
                        ++k;
                    }
                    writer.Append(samples[k], nullValue);
                    last = k;
                }
            }
            finally
            {
                // Release the writer even when Append/Compare throws.
                writer.Close();
            }
        }
示例#2
0
        /// <summary>The main driver for sort program.</summary>
        /// <remarks>
        /// Invoke this method to submit the map/reduce job.
        /// Recognised options are <c>-r</c>, <c>-inFormat</c>, <c>-outFormat</c>,
        /// <c>-outKey</c>, <c>-outValue</c> and <c>-totalOrder</c>; every other
        /// argument is collected as a positional argument, and exactly two of those
        /// (input path, output path) must remain.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <returns>
        /// 0 when the job completes successfully, 1 when it does not, or the value of
        /// <c>PrintUsage()</c> when the arguments are invalid.
        /// </returns>
        /// <exception cref="System.IO.IOException">
        /// When there is communication problems with the
        /// job tracker.
        /// </exception>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Configuration conf           = GetConf();
            JobClient     client         = new JobClient(conf);
            ClusterStatus cluster        = client.GetClusterStatus();
            int           numReduces     = (int)(cluster.GetMaxReduceTasks() * 0.9);
            string        reducesPerHost = conf.Get(ReducesPerHost);

            if (reducesPerHost != null)
            {
                numReduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(reducesPerHost);
            }
            Type           inputFormatClass  = typeof(SequenceFileInputFormat);
            Type           outputFormatClass = typeof(SequenceFileOutputFormat);
            Type           outputKeyClass    = typeof(BytesWritable);
            Type           outputValueClass  = typeof(BytesWritable);
            IList <string> otherArgs         = new AList <string>();

            InputSampler.Sampler <K, V> sampler = null;
            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    switch (args[i])
                    {
                        case "-r":
                            numReduces = System.Convert.ToInt32(args[++i]);
                            break;

                        case "-inFormat":
                            inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass <InputFormat>();
                            break;

                        case "-outFormat":
                            outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass <OutputFormat>();
                            break;

                        case "-outKey":
                            outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass <WritableComparable>();
                            break;

                        case "-outValue":
                            outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass <Writable>();
                            break;

                        case "-totalOrder":
                        {
                            // Parse with the invariant culture so "0.1" means the same
                            // thing regardless of the machine's regional settings.
                            double pcnt = double.Parse(args[++i],
                                                       System.Globalization.CultureInfo.InvariantCulture);
                            int numSamples = System.Convert.ToInt32(args[++i]);
                            int maxSplits  = System.Convert.ToInt32(args[++i]);
                            if (0 >= maxSplits)
                            {
                                maxSplits = int.MaxValue;
                            }
                            sampler = new InputSampler.RandomSampler <K, V>(pcnt, numSamples, maxSplits);
                            break;
                        }

                        default:
                            otherArgs.AddItem(args[i]);
                            break;
                    }
                }
                catch (FormatException)
                {
                    // i already points at the value that failed to parse.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (OverflowException)
                {
                    // Convert.ToInt32 reports out-of-range numbers separately from
                    // malformed ones; treat both as the same usage error.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (IndexOutOfRangeException)
                {
                    // ++i ran past the end of args, so i - 1 is the last argument present.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i -
                                                                                                  1]);
                    return(PrintUsage());
                }
            }
            // Set user-supplied (possibly default) job configs
            job = Job.GetInstance(conf);
            job.SetJobName("sorter");
            job.SetJarByClass(typeof(Sort));
            job.SetMapperClass(typeof(Mapper));
            job.SetReducerClass(typeof(Reducer));
            job.SetNumReduceTasks(numReduces);
            job.SetInputFormatClass(inputFormatClass);
            job.SetOutputFormatClass(outputFormatClass);
            job.SetOutputKeyClass(outputKeyClass);
            job.SetOutputValueClass(outputValueClass);
            // Make sure there are exactly 2 parameters left.
            if (otherArgs.Count != 2)
            {
                System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " + otherArgs.Count
                                             + " instead of 2.");
                return(PrintUsage());
            }
            FileInputFormat.SetInputPaths(job, otherArgs[0]);
            FileOutputFormat.SetOutputPath(job, new Path(otherArgs[1]));
            if (sampler != null)
            {
                System.Console.Out.WriteLine("Sampling input to effect total-order sort...");
                job.SetPartitionerClass(typeof(TotalOrderPartitioner));
                Path inputDir = FileInputFormat.GetInputPaths(job)[0];
                inputDir = inputDir.MakeQualified(inputDir.GetFileSystem(conf));
                Path partitionFile = new Path(inputDir, "_sortPartitioning");

                // Record the partition file and cache entry on the job's own
                // configuration: Hadoop documents Job.getInstance as taking a copy of
                // the Configuration it is given, so settings applied to `conf` after
                // the job was created may never reach the job.
                Configuration jobConf = job.GetConfiguration();
                TotalOrderPartitioner.SetPartitionFile(jobConf, partitionFile);
                InputSampler.WritePartitionFile <K, V>(job, sampler);
                URI partitionUri = new URI(partitionFile.ToString() + "#" + "_sortPartitioning");
                DistributedCache.AddCacheFile(partitionUri, jobConf);
            }
            System.Console.Out.WriteLine("Running on " + cluster.GetTaskTrackers() + " nodes to sort from "
                                         + FileInputFormat.GetInputPaths(job)[0] + " into " + FileOutputFormat.GetOutputPath
                                             (job) + " with " + numReduces + " reduces.");

            // Take timestamps in UTC so the elapsed time is immune to DST changes;
            // convert to local time only for display.
            DateTime startTime = DateTime.UtcNow;

            System.Console.Out.WriteLine("Job started: " + startTime.ToLocalTime());
            int      ret     = job.WaitForCompletion(true) ? 0 : 1;
            DateTime endTime = DateTime.UtcNow;

            System.Console.Out.WriteLine("Job ended: " + endTime.ToLocalTime());
            long elapsedSeconds = (long)(endTime - startTime).TotalSeconds;
            System.Console.Out.WriteLine("The job took " + elapsedSeconds + " seconds.");
            return(ret);
        }
示例#3
0
 /// <summary>
 /// Write a partition file for a job described by a <c>JobConf</c>, using the
 /// Sampler provided.
 /// </summary>
 /// <remarks>
 /// Obtains a job instance from <paramref name="job"/> via <c>Job.GetInstance</c>
 /// and passes it, together with <paramref name="sampler"/>, to
 /// <c>WritePartitionFile</c>.
 /// </remarks>
 /// <param name="job">Job configuration to build the job instance from.</param>
 /// <param name="sampler">Sampler handed through unchanged.</param>
 /// <exception cref="System.IO.IOException"/>
 /// <exception cref="System.TypeLoadException"/>
 /// <exception cref="System.Exception"/>
 public static void WritePartitionFile <K, V>(JobConf job, InputSampler.Sampler <K,
                                                                                 V> sampler)
 {
     Job wrappedJob = Job.GetInstance(job);

     WritePartitionFile <K, V>(wrappedJob, sampler);
 }
示例#4
0
        /// <summary>Driver for InputSampler from the command line.</summary>
        /// <remarks>
        /// Configures a job instance and calls
        /// <see cref="InputSampler{K, V}.WritePartitionFile{K, V}(Org.Apache.Hadoop.Mapreduce.Job, Sampler{K, V})"/>.
        /// Recognised options are <c>-r</c>, <c>-inFormat</c>, <c>-keyClass</c>,
        /// <c>-splitSample</c>, <c>-splitRandom</c> and <c>-splitInterval</c>.
        /// Of the remaining positional arguments the last one is the partition
        /// file to write and all preceding ones are input paths.
        /// When no sampler option is given, a <c>RandomSampler(0.1, 10000, 10)</c>
        /// is used.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <returns>
        /// 0 after the partition file has been written, or the value of
        /// <c>PrintUsage()</c> when the arguments are invalid.
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Job            job       = Job.GetInstance(GetConf());
            AList <string> otherArgs = new AList <string>();

            InputSampler.Sampler <K, V> sampler = null;
            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    switch (args[i])
                    {
                        case "-r":
                            job.SetNumReduceTasks(System.Convert.ToInt32(args[++i]));
                            break;

                        case "-inFormat":
                            job.SetInputFormatClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass <InputFormat>());
                            break;

                        case "-keyClass":
                            job.SetMapOutputKeyClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass <WritableComparable>());
                            break;

                        case "-splitSample":
                        {
                            int numSamples = System.Convert.ToInt32(args[++i]);
                            int maxSplits  = System.Convert.ToInt32(args[++i]);
                            if (0 >= maxSplits)
                            {
                                maxSplits = int.MaxValue;
                            }
                            sampler = new InputSampler.SplitSampler <K, V>(numSamples, maxSplits);
                            break;
                        }

                        case "-splitRandom":
                        {
                            // Invariant culture: the decimal separator must not depend
                            // on the machine's regional settings.
                            double pcnt = double.Parse(args[++i],
                                                       System.Globalization.CultureInfo.InvariantCulture);
                            int numSamples = System.Convert.ToInt32(args[++i]);
                            int maxSplits  = System.Convert.ToInt32(args[++i]);
                            if (0 >= maxSplits)
                            {
                                maxSplits = int.MaxValue;
                            }
                            sampler = new InputSampler.RandomSampler <K, V>(pcnt, numSamples, maxSplits);
                            break;
                        }

                        case "-splitInterval":
                        {
                            double pcnt = double.Parse(args[++i],
                                                       System.Globalization.CultureInfo.InvariantCulture);
                            int maxSplits = System.Convert.ToInt32(args[++i]);
                            if (0 >= maxSplits)
                            {
                                maxSplits = int.MaxValue;
                            }
                            sampler = new InputSampler.IntervalSampler <K, V>(pcnt, maxSplits);
                            break;
                        }

                        default:
                            otherArgs.AddItem(args[i]);
                            break;
                    }
                }
                catch (FormatException)
                {
                    // i already points at the value that failed to parse.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (OverflowException)
                {
                    // Convert.ToInt32 reports out-of-range numbers separately from
                    // malformed ones; treat both as the same usage error.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (IndexOutOfRangeException)
                {
                    // ++i ran past the end of args, so i - 1 is the last argument present.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i -
                                                                                                  1]);
                    return(PrintUsage());
                }
            }
            if (job.GetNumReduceTasks() <= 1)
            {
                System.Console.Error.WriteLine("Sampler requires more than one reducer");
                return(PrintUsage());
            }
            if (otherArgs.Count < 2)
            {
                System.Console.Out.WriteLine("ERROR: Wrong number of parameters: ");
                return(PrintUsage());
            }
            if (null == sampler)
            {
                sampler = new InputSampler.RandomSampler <K, V>(0.1, 10000, 10);
            }

            // The final positional argument is the partition file; take it off the
            // list by index so only input paths remain.
            int    lastIndex     = otherArgs.Count - 1;
            string partitionPath = otherArgs[lastIndex];
            otherArgs.RemoveAt(lastIndex);
            Path outf = new Path(partitionPath);

            // Set the partition file on the job's own configuration: Hadoop documents
            // Job.getInstance as taking a copy of the Configuration it is given, so a
            // value set on GetConf() after the job was created may never reach the job.
            TotalOrderPartitioner.SetPartitionFile(job.GetConfiguration(), outf);
            foreach (string s in otherArgs)
            {
                FileInputFormat.AddInputPath(job, new Path(s));
            }
            InputSampler.WritePartitionFile <K, V>(job, sampler);
            return(0);
        }