Пример #1
0
        /// <summary>
        /// Samples keys through <c>InputSampler.SplitSampler</c> and checks that the
        /// sorted samples are exactly the consecutive values 0, 1, ..., count - 1.
        /// </summary>
        public virtual void TestSplitSampler()
        {
            int totalSplits = 15;
            int sampledSplits = 5;
            int samplesPerSplit = 5;
            int expectedCount = sampledSplits * samplesPerSplit;

            InputSampler.Sampler<IntWritable, NullWritable> splitSampler =
                new InputSampler.SplitSampler<IntWritable, NullWritable>(expectedCount, sampledSplits);

            // Entry s of the array handed to the input format is s * samplesPerSplit.
            int[] splitStarts = new int[totalSplits];
            for (int s = 0; s < splitStarts.Length; s++)
            {
                splitStarts[s] = s * samplesPerSplit;
            }

            Job unusedJob = Job.GetInstance();
            TestInputSampler.TestInputSamplerIF input =
                new TestInputSampler.TestInputSamplerIF(100000, totalSplits, splitStarts);
            object[] drawn = splitSampler.GetSample(input, unusedJob);

            NUnit.Framework.Assert.AreEqual(expectedCount, drawn.Length);

            // IntWritable comparator not typesafe: the samples are sorted as plain objects.
            Arrays.Sort(drawn, new IntWritable.Comparator());
            for (int k = 0; k < expectedCount; k++)
            {
                NUnit.Framework.Assert.AreEqual(k, ((IntWritable)drawn[k]).Get());
            }
        }
Пример #2
0
        /// <summary>
        /// Checks the SplitSampler contract of mapred.lib.InputSampler, the class that
        /// was added back for binary compatibility with M/R 1.x.
        /// </summary>
        /// <remarks>
        /// After sorting, sample <c>k</c> is expected to equal
        /// <c>k % step + totalSplits * (k / step)</c>, where <c>step</c> is the
        /// number of samples per split.
        /// </remarks>
        /// <exception cref="System.Exception"/>
        public virtual void TestMapredSplitSampler()
        {
            int totalSplits = 15;
            int sampledSplits = 5;
            int samplesPerSplit = 5;
            int expectedCount = sampledSplits * samplesPerSplit;

            InputSampler.Sampler<IntWritable, NullWritable> splitSampler =
                new InputSampler.SplitSampler<IntWritable, NullWritable>(expectedCount, sampledSplits);

            // Entry s of the array handed to the input format is s * samplesPerSplit.
            int[] splitStarts = new int[totalSplits];
            for (int s = 0; s < splitStarts.Length; s++)
            {
                splitStarts[s] = s * samplesPerSplit;
            }

            TestInputSampler.TestMapredInputSamplerIF input =
                new TestInputSampler.TestMapredInputSamplerIF(100000, totalSplits, splitStarts);
            object[] drawn = splitSampler.GetSample(input, new JobConf());

            NUnit.Framework.Assert.AreEqual(expectedCount, drawn.Length);

            // IntWritable comparator not typesafe: the samples are sorted as plain objects.
            Arrays.Sort(drawn, new IntWritable.Comparator());
            for (int k = 0; k < expectedCount; k++)
            {
                // mapred.lib.InputSampler.SplitSampler has a sampling step
                int withinGroup = k % samplesPerSplit;
                int group = k / samplesPerSplit;
                int expected = withinGroup + totalSplits * group;
                NUnit.Framework.Assert.AreEqual(expected, ((IntWritable)drawn[k]).Get());
            }
        }
Пример #3
0
        /// <summary>Driver for InputSampler from the command line.</summary>
        /// <remarks>
        /// Configures a Job instance from the command-line arguments and calls
        /// <see cref="InputSampler{K, V}.WritePartitionFile{K, V}(Org.Apache.Hadoop.Mapreduce.Job, Sampler{K, V})"/>.
        /// <para>
        /// Recognized options:
        /// <c>-r &lt;reduces&gt;</c>, <c>-inFormat &lt;class&gt;</c>, <c>-keyClass &lt;class&gt;</c>,
        /// <c>-splitSample &lt;numSamples&gt; &lt;maxSplits&gt;</c>,
        /// <c>-splitRandom &lt;pcnt&gt; &lt;numSamples&gt; &lt;maxSplits&gt;</c> and
        /// <c>-splitInterval &lt;pcnt&gt; &lt;maxSplits&gt;</c>.
        /// A <c>maxSplits</c> value of zero or less is replaced by <c>int.MaxValue</c>.
        /// Every other argument is collected as a path: the last one is the partition
        /// file, the ones before it are input paths.
        /// </para>
        /// <para>
        /// When no sampler option is given, a RandomSampler(0.1, 10000, 10) is used.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line arguments as described above.</param>
        /// <returns>
        /// 0 after the partition file has been written; otherwise the value returned by
        /// <c>PrintUsage()</c> (malformed or missing option value, one reducer or fewer,
        /// or fewer than two path arguments).
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Job job = Job.GetInstance(GetConf());
            AList<string> otherArgs = new AList<string>();

            InputSampler.Sampler<K, V> sampler = null;
            for (int i = 0; i < args.Length; ++i)
            {
                try
                {
                    if ("-r".Equals(args[i]))
                    {
                        job.SetNumReduceTasks(System.Convert.ToInt32(args[++i]));
                    }
                    else if ("-inFormat".Equals(args[i]))
                    {
                        job.SetInputFormatClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>());
                    }
                    else if ("-keyClass".Equals(args[i]))
                    {
                        job.SetMapOutputKeyClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>());
                    }
                    else if ("-splitSample".Equals(args[i]))
                    {
                        int numSamples = System.Convert.ToInt32(args[++i]);
                        int maxSplits = System.Convert.ToInt32(args[++i]);
                        if (maxSplits <= 0)
                        {
                            maxSplits = int.MaxValue;
                        }
                        sampler = new InputSampler.SplitSampler<K, V>(numSamples, maxSplits);
                    }
                    else if ("-splitRandom".Equals(args[i]))
                    {
                        // Parse with the invariant culture so "0.1" means the same thing
                        // regardless of the machine's regional settings.
                        double pcnt = double.Parse(args[++i], System.Globalization.CultureInfo.InvariantCulture);
                        int numSamples = System.Convert.ToInt32(args[++i]);
                        int maxSplits = System.Convert.ToInt32(args[++i]);
                        if (maxSplits <= 0)
                        {
                            maxSplits = int.MaxValue;
                        }
                        sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
                    }
                    else if ("-splitInterval".Equals(args[i]))
                    {
                        double pcnt = double.Parse(args[++i], System.Globalization.CultureInfo.InvariantCulture);
                        int maxSplits = System.Convert.ToInt32(args[++i]);
                        if (maxSplits <= 0)
                        {
                            maxSplits = int.MaxValue;
                        }
                        sampler = new InputSampler.IntervalSampler<K, V>(pcnt, maxSplits);
                    }
                    else
                    {
                        otherArgs.AddItem(args[i]);
                    }
                }
                catch (FormatException)
                {
                    // args[i] is the value that failed to parse: ++i succeeded before the parse threw.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (System.OverflowException)
                {
                    // Numeric text outside the target type's range is reported like any other malformed number.
                    System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
                    return(PrintUsage());
                }
                catch (IndexOutOfRangeException)
                {
                    // ++i ran past the end of args, so args[i - 1] is the last argument supplied.
                    System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
                    return(PrintUsage());
                }
            }
            if (job.GetNumReduceTasks() <= 1)
            {
                System.Console.Error.WriteLine("Sampler requires more than one reducer");
                return(PrintUsage());
            }
            if (otherArgs.Count < 2)
            {
                System.Console.Out.WriteLine("ERROR: Wrong number of parameters: ");
                return(PrintUsage());
            }
            if (null == sampler)
            {
                sampler = new InputSampler.RandomSampler<K, V>(0.1, 10000, 10);
            }

            // The last path argument names the partition file; the remaining ones are inputs.
            Path outf = new Path(otherArgs.Remove(otherArgs.Count - 1));

            // NOTE(review): the partition file is set on GetConf(), while the job was built
            // from GetConf() earlier. If Job.GetInstance copies the configuration, this
            // setting may not be visible to the job - confirm.
            TotalOrderPartitioner.SetPartitionFile(GetConf(), outf);
            foreach (string s in otherArgs)
            {
                FileInputFormat.AddInputPath(job, new Path(s));
            }
            InputSampler.WritePartitionFile<K, V>(job, sampler);
            return(0);
        }