/// <summary>Write a partition file for the given job, using the Sampler provided.</summary>
/// <remarks>
/// Write a partition file for the given job, using the Sampler provided.
/// Queries the sampler for a sample keyset, sorts by the output key
/// comparator, selects the keys for each rank, and writes to the destination
/// returned from
/// <see cref="TotalOrderPartitioner{K, V}.GetPartitionFile(Org.Apache.Hadoop.Conf.Configuration)"/>.
/// </remarks>
/// <param name="job">job whose input format, sort comparator, map-output key class and reduce count drive the sampling</param>
/// <param name="sampler">sampler used to draw the candidate keyset</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.Exception"/>
public static void WritePartitionFile<K, V>(Job job, InputSampler.Sampler<K, V> sampler)
{
    // getInputFormat, getOutputKeyComparator
    Configuration conf = job.GetConfiguration();
    InputFormat inf = ReflectionUtils.NewInstance(job.GetInputFormatClass(), conf);
    int numPartitions = job.GetNumReduceTasks();
    K[] samples = (K[])sampler.GetSample(inf, job);
    Log.Info("Using " + samples.Length + " samples");
    RawComparator<K> comparator = (RawComparator<K>)job.GetSortComparator();
    Arrays.Sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.GetPartitionFile(conf));
    FileSystem fs = dst.GetFileSystem(conf);
    // Replace any stale partition file from a previous run.
    if (fs.Exists(dst))
    {
        fs.Delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, dst,
        job.GetMapOutputKeyClass(), typeof(NullWritable));
    // BUGFIX: the original never closed the writer if Append/Compare threw,
    // leaking the open stream and leaving a partial partition file behind.
    try
    {
        NullWritable nullValue = NullWritable.Get();
        // One boundary key per reducer rank, evenly spaced through the
        // sorted sample set; numPartitions - 1 keys are emitted in total.
        float stepSize = samples.Length / (float)numPartitions;
        int last = -1;
        for (int i = 1; i < numPartitions; ++i)
        {
            int k = Math.Round(stepSize * i);
            // Advance past keys equal to the previously emitted split point
            // so every boundary key is distinct.
            // NOTE(review): with heavily duplicated samples ++k can run past
            // samples.Length and raise an index-out-of-range error — confirm
            // the sampler yields more distinct keys than partitions.
            while (last >= k && comparator.Compare(samples[last], samples[k]) == 0)
            {
                ++k;
            }
            writer.Append(samples[k], nullValue);
            last = k;
        }
    }
    finally
    {
        writer.Close();
    }
}
/// <summary>The main driver for sort program.</summary>
/// <remarks>
/// The main driver for sort program.
/// Invoke this method to submit the map/reduce job.
/// </remarks>
/// <param name="args">command-line flags (-r, -inFormat, -outFormat, -outKey, -outValue, -totalOrder) plus input and output paths</param>
/// <returns>0 on success, 1 on job failure, or the result of PrintUsage() on bad arguments</returns>
/// <exception cref="System.IO.IOException">
/// When there is communication problems with the
/// job tracker.
/// </exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Configuration conf = GetConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.GetClusterStatus();
    // Default reduce count: 90% of the cluster's reduce capacity.
    int num_reduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
    string sort_reduces = conf.Get(ReducesPerHost);
    if (sort_reduces != null)
    {
        num_reduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(sort_reduces);
    }
    Type inputFormatClass = typeof(SequenceFileInputFormat);
    Type outputFormatClass = typeof(SequenceFileOutputFormat);
    Type outputKeyClass = typeof(BytesWritable);
    Type outputValueClass = typeof(BytesWritable);
    IList<string> otherArgs = new AList<string>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-r".Equals(args[i]))
            {
                num_reduces = System.Convert.ToInt32(args[++i]);
            }
            else if ("-inFormat".Equals(args[i]))
            {
                inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>();
            }
            else if ("-outFormat".Equals(args[i]))
            {
                outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
            }
            else if ("-outKey".Equals(args[i]))
            {
                outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>();
            }
            else if ("-outValue".Equals(args[i]))
            {
                outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<Writable>();
            }
            else if ("-totalOrder".Equals(args[i]))
            {
                // -totalOrder <sample pcnt> <num samples> <max splits>
                double pcnt = double.ParseDouble(args[++i]);
                int numSamples = System.Convert.ToInt32(args[++i]);
                int maxSplits = System.Convert.ToInt32(args[++i]);
                if (0 >= maxSplits)
                {
                    maxSplits = int.MaxValue;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            }
            else
            {
                otherArgs.AddItem(args[i]);
            }
        }
        catch (FormatException)
        {
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return PrintUsage();
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return PrintUsage();
        }
    }
    // exits
    // Set user-supplied (possibly default) job configs
    job = Job.GetInstance(conf);
    job.SetJobName("sorter");
    job.SetJarByClass(typeof(Sort));
    job.SetMapperClass(typeof(Mapper));
    job.SetReducerClass(typeof(Reducer));
    job.SetNumReduceTasks(num_reduces);
    job.SetInputFormatClass(inputFormatClass);
    job.SetOutputFormatClass(outputFormatClass);
    job.SetOutputKeyClass(outputKeyClass);
    job.SetOutputValueClass(outputValueClass);
    // Make sure there are exactly 2 parameters left.
    if (otherArgs.Count != 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " + otherArgs.Count
            + " instead of 2.");
        return PrintUsage();
    }
    FileInputFormat.SetInputPaths(job, otherArgs[0]);
    FileOutputFormat.SetOutputPath(job, new Path(otherArgs[1]));
    if (sampler != null)
    {
        System.Console.Out.WriteLine("Sampling input to effect total-order sort...");
        job.SetPartitionerClass(typeof(TotalOrderPartitioner));
        Path inputDir = FileInputFormat.GetInputPaths(job)[0];
        inputDir = inputDir.MakeQualified(inputDir.GetFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.SetPartitionFile(conf, partitionFile);
        InputSampler.WritePartitionFile<K, V>(job, sampler);
        URI partitionUri = new URI(partitionFile.ToString() + "#" + "_sortPartitioning");
        DistributedCache.AddCacheFile(partitionUri, conf);
    }
    System.Console.Out.WriteLine("Running on " + cluster.GetTaskTrackers()
        + " nodes to sort from " + FileInputFormat.GetInputPaths(job)[0] + " into "
        + FileOutputFormat.GetOutputPath(job) + " with " + num_reduces + " reduces.");
    // BUGFIX: `new DateTime()` is DateTime.MinValue (year 0001), so the
    // original printed a bogus start time and always reported 0 seconds
    // elapsed. Use the current clock instead (Java original used `new Date()`).
    DateTime startTime = DateTime.Now;
    System.Console.Out.WriteLine("Job started: " + startTime);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    DateTime end_time = DateTime.Now;
    System.Console.Out.WriteLine("Job ended: " + end_time);
    System.Console.Out.WriteLine("The job took "
        + (end_time.GetTime() - startTime.GetTime()) / 1000 + " seconds.");
    return ret;
}
/// <summary>Write a partition file for the given mapred-API job.</summary>
/// <remarks>
/// Backwards-compatible overload: wraps the <c>JobConf</c> in a
/// mapreduce-API <c>Job</c> and delegates to
/// <c>WritePartitionFile(Job, Sampler)</c>.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.Exception"/>
public static void WritePartitionFile<K, V>(JobConf job, InputSampler.Sampler<K, V> sampler)
{
    Job newApiJob = Job.GetInstance(job);
    WritePartitionFile(newApiJob, sampler);
}
/// <summary>Driver for InputSampler from the command line.</summary>
/// <remarks>
/// Driver for InputSampler from the command line.
/// Configures a Job instance from the flags, chooses the requested sampler
/// (falling back to a RandomSampler when none is given), and calls
/// <see cref="InputSampler{K, V}.WritePartitionFile{K, V}(Org.Apache.Hadoop.Mapreduce.Job, Sampler{K, V})"/>.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Job job = Job.GetInstance(GetConf());
    AList<string> remainingArgs = new AList<string>();
    InputSampler.Sampler<K, V> chosenSampler = null;
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-r".Equals(args[i]))
            {
                job.SetNumReduceTasks(System.Convert.ToInt32(args[++i]));
            }
            else if ("-inFormat".Equals(args[i]))
            {
                job.SetInputFormatClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>());
            }
            else if ("-keyClass".Equals(args[i]))
            {
                job.SetMapOutputKeyClass(Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>());
            }
            else if ("-splitSample".Equals(args[i]))
            {
                // -splitSample <num samples> <max splits>
                int sampleCount = System.Convert.ToInt32(args[++i]);
                int splitCap = System.Convert.ToInt32(args[++i]);
                if (0 >= splitCap)
                {
                    splitCap = int.MaxValue;
                }
                chosenSampler = new InputSampler.SplitSampler<K, V>(sampleCount, splitCap);
            }
            else if ("-splitRandom".Equals(args[i]))
            {
                // -splitRandom <frequency> <num samples> <max splits>
                double freq = double.ParseDouble(args[++i]);
                int sampleCount = System.Convert.ToInt32(args[++i]);
                int splitCap = System.Convert.ToInt32(args[++i]);
                if (0 >= splitCap)
                {
                    splitCap = int.MaxValue;
                }
                chosenSampler = new InputSampler.RandomSampler<K, V>(freq, sampleCount, splitCap);
            }
            else if ("-splitInterval".Equals(args[i]))
            {
                // -splitInterval <frequency> <max splits>
                double freq = double.ParseDouble(args[++i]);
                int splitCap = System.Convert.ToInt32(args[++i]);
                if (0 >= splitCap)
                {
                    splitCap = int.MaxValue;
                }
                chosenSampler = new InputSampler.IntervalSampler<K, V>(freq, splitCap);
            }
            else
            {
                remainingArgs.AddItem(args[i]);
            }
        }
        catch (FormatException)
        {
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return PrintUsage();
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return PrintUsage();
        }
    }
    if (job.GetNumReduceTasks() <= 1)
    {
        System.Console.Error.WriteLine("Sampler requires more than one reducer");
        return PrintUsage();
    }
    if (remainingArgs.Count < 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: ");
        return PrintUsage();
    }
    if (null == chosenSampler)
    {
        chosenSampler = new InputSampler.RandomSampler<K, V>(0.1, 10000, 10);
    }
    // The final positional argument is the partition-file destination; all
    // preceding ones are input paths.
    Path partitionPath = new Path(remainingArgs.Remove(remainingArgs.Count - 1));
    TotalOrderPartitioner.SetPartitionFile(GetConf(), partitionPath);
    foreach (string input in remainingArgs)
    {
        FileInputFormat.AddInputPath(job, new Path(input));
    }
    InputSampler.WritePartitionFile<K, V>(job, chosenSampler);
    return 0;
}