/// <summary>
/// Verify IntervalSampler in mapred.lib.InputSampler, which is added back
/// for binary compatibility of M/R 1.x
/// </summary>
/// <remarks>
/// Samples 16 splits at a frequency of 1/16 with a sample budget of 64
/// (16 splits times 4 per split), then asserts that exactly 64 samples come
/// back and that, once sorted, the sample at position <c>n</c> holds the
/// value <c>n</c>.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestMapredIntervalSampler()
{
    // IntWritable comparator not typesafe
    int splitCount = 16;
    int samplesPerSplit = 4;
    int expectedSamples = splitCount * samplesPerSplit;
    double frequency = 1.0 / splitCount;

    InputSampler.Sampler<IntWritable, NullWritable> sampler =
        new InputSampler.IntervalSampler<IntWritable, NullWritable>(frequency, expectedSamples);

    // Split number k is seeded with the initial value k.
    int[] inits = new int[splitCount];
    for (int split = 0; split < splitCount; split++)
    {
        inits[split] = split;
    }

    Job ignored = Job.GetInstance();
    TestInputSampler.TestInputSamplerIF inputFormat =
        new TestInputSampler.TestInputSamplerIF(expectedSamples, splitCount, inits);
    object[] samples = sampler.GetSample(inputFormat, ignored);

    NUnit.Framework.Assert.AreEqual(expectedSamples, samples.Length);

    Arrays.Sort(samples, new IntWritable.Comparator());
    for (int position = 0; position < expectedSamples; position++)
    {
        IntWritable sample = (IntWritable)samples[position];
        NUnit.Framework.Assert.AreEqual(position, sample.Get());
    }
}
/// <summary>Driver for InputSampler from the command line.</summary>
/// <remarks>
/// Driver for InputSampler from the command line.
/// Configures a Job instance and calls
/// <see cref="InputSampler{K, V}.WritePartitionFile{K, V}(Org.Apache.Hadoop.Mapreduce.Job, Sampler{K, V})"/>.
/// <para>
/// Recognized options (each consumes the arguments that follow it):
/// <c>-r &lt;reduces&gt;</c>, <c>-inFormat &lt;class&gt;</c>, <c>-keyClass &lt;class&gt;</c>,
/// <c>-splitSample &lt;numSamples&gt; &lt;maxSplits&gt;</c>,
/// <c>-splitRandom &lt;pcnt&gt; &lt;numSamples&gt; &lt;maxSplits&gt;</c> and
/// <c>-splitInterval &lt;pcnt&gt; &lt;maxSplits&gt;</c>.
/// A <c>maxSplits</c> value of zero or less is replaced by <c>int.MaxValue</c>.
/// Every other argument is collected as a path: the last one is the
/// partition file, the ones before it are input paths.
/// </para>
/// <para>
/// If no sampler option is given, a RandomSampler(0.1, 10000, 10) is used.
/// </para>
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
/// <returns>
/// 0 once the partition file has been written; otherwise the value returned
/// by <c>PrintUsage()</c> (malformed number, missing option parameter, fewer
/// than two reducers, or fewer than two path arguments).
/// </returns>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Job job = Job.GetInstance(GetConf());
    AList<string> otherArgs = new AList<string>();
    InputSampler.Sampler<K, V> sampler = null;

    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-r".Equals(args[i]))
            {
                job.SetNumReduceTasks(System.Convert.ToInt32(args[++i]));
            }
            else if ("-inFormat".Equals(args[i]))
            {
                job.SetInputFormatClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>());
            }
            else if ("-keyClass".Equals(args[i]))
            {
                job.SetMapOutputKeyClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>());
            }
            else if ("-splitSample".Equals(args[i]))
            {
                int numSamples = System.Convert.ToInt32(args[++i]);
                int maxSplits = System.Convert.ToInt32(args[++i]);
                if (0 >= maxSplits)
                {
                    maxSplits = int.MaxValue;
                }
                sampler = new InputSampler.SplitSampler<K, V>(numSamples, maxSplits);
            }
            else if ("-splitRandom".Equals(args[i]))
            {
                // double.Parse replaces the non-existent double.ParseDouble; the
                // invariant culture keeps "0.1" parseable regardless of the
                // machine's decimal separator.
                double pcnt = double.Parse(args[++i], System.Globalization.CultureInfo.InvariantCulture);
                int numSamples = System.Convert.ToInt32(args[++i]);
                int maxSplits = System.Convert.ToInt32(args[++i]);
                if (0 >= maxSplits)
                {
                    maxSplits = int.MaxValue;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            }
            else if ("-splitInterval".Equals(args[i]))
            {
                double pcnt = double.Parse(args[++i], System.Globalization.CultureInfo.InvariantCulture);
                int maxSplits = System.Convert.ToInt32(args[++i]);
                if (0 >= maxSplits)
                {
                    maxSplits = int.MaxValue;
                }
                sampler = new InputSampler.IntervalSampler<K, V>(pcnt, maxSplits);
            }
            else
            {
                otherArgs.AddItem(args[i]);
            }
        }
        catch (FormatException)
        {
            // i was advanced before parsing, so args[i] is the offending token.
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return PrintUsage();
        }
        catch (System.OverflowException)
        {
            // Convert.ToInt32 reports out-of-range numerals with OverflowException
            // rather than FormatException; treat both as a bad numeric argument.
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return PrintUsage();
        }
        catch (IndexOutOfRangeException)
        {
            // ++i ran past the end of args, so args[i - 1] is the last argument supplied.
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return PrintUsage();
        }
    }

    if (job.GetNumReduceTasks() <= 1)
    {
        System.Console.Error.WriteLine("Sampler requires more than one reducer");
        return PrintUsage();
    }
    if (otherArgs.Count < 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: ");
        return PrintUsage();
    }
    if (null == sampler)
    {
        sampler = new InputSampler.RandomSampler<K, V>(0.1, 10000, 10);
    }

    // The final positional argument names the partition file; the rest are inputs.
    Path outf = new Path(otherArgs.Remove(otherArgs.Count - 1));
    // NOTE(review): the partition file is set on GetConf() while sampling is driven
    // by `job`; confirm that `job` observes this setting.
    TotalOrderPartitioner.SetPartitionFile(GetConf(), outf);
    foreach (string s in otherArgs)
    {
        FileInputFormat.AddInputPath(job, new Path(s));
    }
    InputSampler.WritePartitionFile<K, V>(job, sampler);
    return 0;
}