/// <summary>This is the main routine for launching a distributed random write job.</summary>
/// <remarks>
/// This is the main routine for launching a distributed random write job.
/// It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
/// The reduce doesn't do anything.
/// </remarks>
/// <param name="args">command line; args[0] is the output directory.</param>
/// <returns>0 on success, 1 on job failure, 2 on missing arguments, -2 on bad config.</returns>
/// <exception cref="System.IO.IOException"></exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    if (args.Length == 0)
    {
        System.Console.Out.WriteLine("Usage: writer <out-dir>");
        ToolRunner.PrintGenericCommandUsage(System.Console.Out);
        return 2;
    }
    Path outDir = new Path(args[0]);
    Configuration conf = GetConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.GetClusterStatus();
    // Defaults: 10 maps per host, 1 GiB written by each map.
    int numMapsPerHost = conf.GetInt(MapsPerHost, 10);
    long numBytesToWritePerMap = conf.GetLong(BytesPerMap, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0)
    {
        // FIX: added the missing space after "Cannot have" (the message previously
        // rendered as "Cannot have<key> set to 0"); now matches the wording of the
        // identical check in the RandomTextWriter driver below.
        System.Console.Error.WriteLine("Cannot have " + BytesPerMap + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.GetLong(TotalBytes, numMapsPerHost * numBytesToWritePerMap
        * cluster.GetTaskTrackers());
    int numMaps = (int)(totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0)
    {
        // Requested total is smaller than one map's quota: run a single map
        // that writes exactly the requested total.
        numMaps = 1;
        conf.SetLong(BytesPerMap, totalBytesToWrite);
    }
    conf.SetInt(MRJobConfig.NumMaps, numMaps);
    Job job = Job.GetInstance(conf);
    job.SetJarByClass(typeof(RandomWriter));
    job.SetJobName("random-writer");
    FileOutputFormat.SetOutputPath(job, outDir);
    job.SetOutputKeyClass(typeof(BytesWritable));
    job.SetOutputValueClass(typeof(BytesWritable));
    job.SetInputFormatClass(typeof(RandomWriter.RandomInputFormat));
    job.SetMapperClass(typeof(RandomWriter.RandomMapper));
    job.SetReducerClass(typeof(Reducer));
    job.SetOutputFormatClass(typeof(SequenceFileOutputFormat));
    System.Console.Out.WriteLine("Running " + numMaps + " maps.");
    // reducer NONE
    job.SetNumReduceTasks(0);
    // NOTE(review): `new DateTime()` mirrors Java's `new Date()` via Sharpen; in plain
    // .NET this is DateTime.MinValue and `GetTime()` is presumably a Sharpen extension
    // that makes the elapsed-time arithmetic work — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    DateTime endTime = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + endTime);
    System.Console.Out.WriteLine("The job took "
        + (endTime.GetTime() - startTime.GetTime()) / 1000 + " seconds.");
    return ret;
}
/// <summary>This is the main routine for launching a distributed random write job.</summary>
/// <remarks>
/// This is the main routine for launching a distributed random write job.
/// It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
/// The reduce doesn't do anything.
/// </remarks>
/// <param name="args">command line; supports "-outFormat &lt;class&gt;", remaining argument is the output path.</param>
/// <returns>0 on success, 1 on job failure, -2 on bad config, or the result of PrintUsage on bad arguments.</returns>
/// <exception cref="System.IO.IOException"></exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    if (args.Length == 0)
    {
        return PrintUsage();
    }
    Configuration conf = GetConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.GetClusterStatus();
    // Defaults: 10 maps per host, 1 GiB written by each map.
    int numMapsPerHost = conf.GetInt(MapsPerHost, 10);
    long numBytesToWritePerMap = conf.GetLong(BytesPerMap, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0)
    {
        System.Console.Error.WriteLine("Cannot have " + BytesPerMap + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.GetLong(TotalBytes, numMapsPerHost * numBytesToWritePerMap
        * cluster.GetTaskTrackers());
    int numMaps = (int)(totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0)
    {
        // Requested total is smaller than one map's quota: run a single map
        // that writes exactly the requested total.
        numMaps = 1;
        conf.SetLong(BytesPerMap, totalBytesToWrite);
    }
    conf.SetInt(MRJobConfig.NumMaps, numMaps);
    Job job = Job.GetInstance(conf);
    job.SetJarByClass(typeof(RandomTextWriter));
    job.SetJobName("random-text-writer");
    job.SetOutputKeyClass(typeof(Org.Apache.Hadoop.IO.Text));
    job.SetOutputValueClass(typeof(Org.Apache.Hadoop.IO.Text));
    job.SetInputFormatClass(typeof(RandomWriter.RandomInputFormat));
    job.SetMapperClass(typeof(RandomTextWriter.RandomTextMapper));
    Type outputFormatClass = typeof(SequenceFileOutputFormat);
    IList<string> otherArgs = new AList<string>();
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-outFormat".Equals(args[i]))
            {
                outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
            }
            else
            {
                // Non-option arguments are collected as positional parameters.
                otherArgs.AddItem(args[i]);
            }
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return PrintUsage();
        }
    }
    // exits
    // FIX: guard against a missing output path. Previously, invoking the tool with
    // only "-outFormat <class>" reached otherArgs[0] and threw an unhandled
    // out-of-range exception instead of printing usage.
    if (otherArgs.Count == 0)
    {
        System.Console.Out.WriteLine("ERROR: Required parameter <output> missing.");
        return PrintUsage();
    }
    job.SetOutputFormatClass(outputFormatClass);
    FileOutputFormat.SetOutputPath(job, new Path(otherArgs[0]));
    System.Console.Out.WriteLine("Running " + numMaps + " maps.");
    // reducer NONE
    job.SetNumReduceTasks(0);
    // NOTE(review): `new DateTime()` mirrors Java's `new Date()` via Sharpen; `GetTime()`
    // is presumably a Sharpen extension — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    DateTime endTime = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + endTime);
    System.Console.Out.WriteLine("The job took "
        + (endTime.GetTime() - startTime.GetTime()) / 1000 + " seconds.");
    return ret;
}
/// <summary>The main driver for sort program.</summary>
/// <remarks>
/// The main driver for sort program.
/// Invoke this method to submit the map/reduce job.
/// Despite the summary, this variant configures a map-side join via
/// CompositeInputFormat ("join" job name); the last positional argument is the
/// output path and all preceding ones are input paths to join.
/// </remarks>
/// <exception cref="System.IO.IOException">
/// When there is communication problems with the
/// job tracker.
/// </exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Configuration conf = GetConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.GetClusterStatus();
    // Default: 90% of the cluster's total reduce-slot capacity.
    int num_reduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
    string join_reduces = conf.Get(ReducesPerHost);
    if (join_reduces != null)
    {
        // Explicit per-host override: reduces = tasktrackers * reduces-per-host.
        num_reduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(join_reduces);
    }
    Job job = Job.GetInstance(conf);
    job.SetJobName("join");
    job.SetJarByClass(typeof(Sort));
    // Identity map/reduce: the join itself happens in CompositeInputFormat.
    job.SetMapperClass(typeof(Mapper));
    job.SetReducerClass(typeof(Reducer));
    Type inputFormatClass = typeof(SequenceFileInputFormat);
    Type outputFormatClass = typeof(SequenceFileOutputFormat);
    Type outputKeyClass = typeof(BytesWritable);
    Type outputValueClass = typeof(TupleWritable);
    string op = "inner";
    IList<string> otherArgs = new AList<string>();
    // Parse options; any "-flag" consuming args[++i] may run off the end of args,
    // which is caught below and reported as a missing parameter.
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-r".Equals(args[i]))
            {
                num_reduces = System.Convert.ToInt32(args[++i]);
            }
            else
            {
                if ("-inFormat".Equals(args[i]))
                {
                    inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>();
                }
                else
                {
                    if ("-outFormat".Equals(args[i]))
                    {
                        outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
                    }
                    else
                    {
                        if ("-outKey".Equals(args[i]))
                        {
                            outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>();
                        }
                        else
                        {
                            if ("-outValue".Equals(args[i]))
                            {
                                outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<Writable>();
                            }
                            else
                            {
                                if ("-joinOp".Equals(args[i]))
                                {
                                    // Join operation, e.g. "inner" / "outer" / "override".
                                    op = args[++i];
                                }
                                else
                                {
                                    // Positional argument: an input path, or the output path if last.
                                    otherArgs.AddItem(args[i]);
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (FormatException)
        {
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return(PrintUsage());
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i - 1]);
            return(PrintUsage());
        }
    }
    // exits
    // Set user-supplied (possibly default) job configs
    job.SetNumReduceTasks(num_reduces);
    // Need at least one input path plus the output path.
    if (otherArgs.Count < 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: ");
        return(PrintUsage());
    }
    // NOTE(review): Remove(int) here presumably mirrors Java's List.remove(int) via
    // Sharpen's AList — removing the LAST element by index and returning it (the
    // output path) — rather than IList<string>.Remove(item); confirm in the Sharpen runtime.
    FileOutputFormat.SetOutputPath(job, new Path(otherArgs.Remove(otherArgs.Count - 1
        )));
    IList<Path> plist = new AList<Path>(otherArgs.Count);
    foreach (string s in otherArgs)
    {
        plist.AddItem(new Path(s));
    }
    // The join expression wires all input paths and the join op into the
    // CompositeInputFormat configuration.
    job.SetInputFormatClass(typeof(CompositeInputFormat));
    job.GetConfiguration().Set(CompositeInputFormat.JoinExpr, CompositeInputFormat.Compose
        (op, inputFormatClass, Sharpen.Collections.ToArray(plist, new Path[0])));
    job.SetOutputFormatClass(outputFormatClass);
    job.SetOutputKeyClass(outputKeyClass);
    job.SetOutputValueClass(outputValueClass);
    // NOTE(review): `new DateTime()` mirrors Java's `new Date()` via Sharpen; `GetTime()`
    // is presumably a Sharpen extension — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    DateTime end_time = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + end_time);
    System.Console.Out.WriteLine("The job took " + (end_time.GetTime() - startTime.GetTime
        ()) / 1000 + " seconds.");
    return(ret);
}
/// <summary>The main driver for sort program.</summary>
/// <remarks>
/// The main driver for sort program.
/// Invoke this method to submit the map/reduce job.
/// Expects exactly two positional arguments (input, output) after option parsing;
/// "-totalOrder pcnt numSamples maxSplits" enables a sampled total-order sort.
/// </remarks>
/// <exception cref="System.IO.IOException">
/// When there is communication problems with the
/// job tracker.
/// </exception>
/// <exception cref="System.Exception"/>
public virtual int Run(string[] args)
{
    Configuration conf = GetConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.GetClusterStatus();
    // Default: 90% of the cluster's total reduce-slot capacity.
    int num_reduces = (int)(cluster.GetMaxReduceTasks() * 0.9);
    string sort_reduces = conf.Get(ReducesPerHost);
    if (sort_reduces != null)
    {
        // Explicit per-host override: reduces = tasktrackers * reduces-per-host.
        num_reduces = cluster.GetTaskTrackers() * System.Convert.ToInt32(sort_reduces);
    }
    Type inputFormatClass = typeof(SequenceFileInputFormat);
    Type outputFormatClass = typeof(SequenceFileOutputFormat);
    Type outputKeyClass = typeof(BytesWritable);
    Type outputValueClass = typeof(BytesWritable);
    IList<string> otherArgs = new AList<string>();
    // Non-null only when -totalOrder was given; drives partition-file sampling below.
    InputSampler.Sampler<K, V> sampler = null;
    // Parse options; any "-flag" consuming args[++i] may run off the end of args,
    // which is caught below and reported as a missing parameter.
    for (int i = 0; i < args.Length; ++i)
    {
        try
        {
            if ("-r".Equals(args[i]))
            {
                num_reduces = System.Convert.ToInt32(args[++i]);
            }
            else
            {
                if ("-inFormat".Equals(args[i]))
                {
                    inputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<InputFormat>();
                }
                else
                {
                    if ("-outFormat".Equals(args[i]))
                    {
                        outputFormatClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<OutputFormat>();
                    }
                    else
                    {
                        if ("-outKey".Equals(args[i]))
                        {
                            outputKeyClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<WritableComparable>();
                        }
                        else
                        {
                            if ("-outValue".Equals(args[i]))
                            {
                                outputValueClass = Sharpen.Runtime.GetType(args[++i]).AsSubclass<Writable>();
                            }
                            else
                            {
                                if ("-totalOrder".Equals(args[i]))
                                {
                                    // -totalOrder <sample probability> <num samples> <max splits>
                                    double pcnt = double.ParseDouble(args[++i]);
                                    int numSamples = System.Convert.ToInt32(args[++i]);
                                    int maxSplits = System.Convert.ToInt32(args[++i]);
                                    if (0 >= maxSplits)
                                    {
                                        // Non-positive means "no limit" on splits sampled.
                                        maxSplits = int.MaxValue;
                                    }
                                    sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
                                }
                                else
                                {
                                    // Positional argument: input path, then output path.
                                    otherArgs.AddItem(args[i]);
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (FormatException)
        {
            System.Console.Out.WriteLine("ERROR: Integer expected instead of " + args[i]);
            return(PrintUsage());
        }
        catch (IndexOutOfRangeException)
        {
            System.Console.Out.WriteLine("ERROR: Required parameter missing from " + args[i -
                1]);
            return(PrintUsage());
        }
    }
    // exits
    // Set user-supplied (possibly default) job configs
    // NOTE(review): `job` is assigned without a local declaration — presumably an
    // instance field of the enclosing class (as in Hadoop's Sort.java); confirm.
    job = Job.GetInstance(conf);
    job.SetJobName("sorter");
    job.SetJarByClass(typeof(Sort));
    // Identity map/reduce: the sort happens in the shuffle/merge phase.
    job.SetMapperClass(typeof(Mapper));
    job.SetReducerClass(typeof(Reducer));
    job.SetNumReduceTasks(num_reduces);
    job.SetInputFormatClass(inputFormatClass);
    job.SetOutputFormatClass(outputFormatClass);
    job.SetOutputKeyClass(outputKeyClass);
    job.SetOutputValueClass(outputValueClass);
    // Make sure there are exactly 2 parameters left.
    if (otherArgs.Count != 2)
    {
        System.Console.Out.WriteLine("ERROR: Wrong number of parameters: " + otherArgs.Count
            + " instead of 2.");
        return(PrintUsage());
    }
    FileInputFormat.SetInputPaths(job, otherArgs[0]);
    FileOutputFormat.SetOutputPath(job, new Path(otherArgs[1]));
    if (sampler != null)
    {
        // Total-order sort: sample the input to build a partition file, then ship
        // it to every task via the distributed cache.
        System.Console.Out.WriteLine("Sampling input to effect total-order sort...");
        job.SetPartitionerClass(typeof(TotalOrderPartitioner));
        Path inputDir = FileInputFormat.GetInputPaths(job)[0];
        inputDir = inputDir.MakeQualified(inputDir.GetFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        // NOTE(review): this mutates `conf` AFTER Job.GetInstance(conf) copied it —
        // looks like it relies on DistributedCache/partition settings still being
        // picked up at submit time; verify, since job.GetConfiguration() is the
        // copy the job actually submits.
        TotalOrderPartitioner.SetPartitionFile(conf, partitionFile);
        InputSampler.WritePartitionFile<K, V>(job, sampler);
        // "#_sortPartitioning" names the cache-file symlink tasks will read.
        URI partitionUri = new URI(partitionFile.ToString() + "#" + "_sortPartitioning");
        DistributedCache.AddCacheFile(partitionUri, conf);
    }
    System.Console.Out.WriteLine("Running on " + cluster.GetTaskTrackers() + " nodes to sort from "
        + FileInputFormat.GetInputPaths(job)[0] + " into " + FileOutputFormat.GetOutputPath
        (job) + " with " + num_reduces + " reduces.");
    // NOTE(review): `new DateTime()` mirrors Java's `new Date()` via Sharpen; `GetTime()`
    // is presumably a Sharpen extension — confirm against the Sharpen runtime.
    DateTime startTime = new DateTime();
    System.Console.Out.WriteLine("Job started: " + startTime);
    int ret = job.WaitForCompletion(true) ? 0 : 1;
    DateTime end_time = new DateTime();
    System.Console.Out.WriteLine("Job ended: " + end_time);
    System.Console.Out.WriteLine("The job took " + (end_time.GetTime() - startTime.GetTime
        ()) / 1000 + " seconds.");
    return(ret);
}