コード例 #1
0
        /// <summary>
        /// Use the input splits to take samples of the input and generate sample
        /// keys.
        /// </summary>
        /// <remarks>
        /// Use the input splits to take samples of the input and generate sample
        /// keys. By default reads 100,000 keys from 10 locations in the input, sorts
        /// them and picks N-1 keys to generate N equally sized partitions.
        /// <para>
        /// One reader thread is started per sampled split. If the sampler thread group
        /// reports a failure, that failure is rethrown to the caller instead of being
        /// discarded, and the partition file stream is always closed.
        /// </para>
        /// </remarks>
        /// <param name="job">the job to sample</param>
        /// <param name="partFile">where to write the output file to</param>
        /// <exception cref="System.Exception">
        /// if something goes wrong, including any failure recorded by the sampler
        /// thread group
        /// </exception>
        public static void WritePartitionFile(JobContext job, Path partFile)
        {
            long            t1       = Runtime.CurrentTimeMillis();
            Configuration   conf     = job.GetConfiguration();
            TeraInputFormat inFormat = new TeraInputFormat();

            TeraInputFormat.TextSampler sampler = new TeraInputFormat.TextSampler();
            int  partitions           = job.GetNumReduceTasks();
            long sampleSize           = conf.GetLong(SampleSize, 100000);
            IList <InputSplit> splits = inFormat.GetSplits(job);
            long t2 = Runtime.CurrentTimeMillis();

            System.Console.Out.WriteLine("Computing input splits took " + (t2 - t1) + "ms");
            // Never sample more splits than actually exist.
            int samples = Math.Min(conf.GetInt(NumPartitions, 10), splits.Count);

            System.Console.Out.WriteLine("Sampling " + samples + " splits of " + splits.Count
                                         );
            long recordsPerSample = sampleSize / samples;
            int  sampleStep       = splits.Count / samples;

            Sharpen.Thread[] samplerReader = new Sharpen.Thread[samples];
            TeraInputFormat.SamplerThreadGroup threadGroup = new TeraInputFormat.SamplerThreadGroup
                                                                 ("Sampler Reader Thread Group");
            // take N samples from different parts of the input
            for (int i = 0; i < samples; ++i)
            {
                int idx = i;
                samplerReader[i] = new _Thread_140(job, inFormat, splits, sampleStep, idx, sampler
                                                   , recordsPerSample, threadGroup, "Sampler Reader " + idx);
                samplerReader[i].Start();
            }
            FileSystem       outFs  = partFile.GetFileSystem(conf);
            DataOutputStream writer = outFs.Create(partFile, true, 64 * 1024, (short)10, outFs
                                                   .GetDefaultBlockSize(partFile));

            try
            {
                for (int j = 0; j < samples; j++)
                {
                    try
                    {
                        samplerReader[j].Join();
                    }
                    catch (Exception)
                    {
                        // Best effort, as before: a failure of the wait itself
                        // (e.g. an interruption) is ignored.
                    }
                    // Checked outside the try/catch above so that a sampler failure
                    // reaches the caller instead of being swallowed with the wait errors.
                    Exception failure = threadGroup.GetThrowable();
                    if (failure != null)
                    {
                        throw failure;
                    }
                }
                foreach (Text split in sampler.CreatePartitions(partitions))
                {
                    split.Write(writer);
                }
            }
            finally
            {
                // Release the output stream even when sampling or writing fails.
                writer.Close();
            }
            long t3 = Runtime.CurrentTimeMillis();

            System.Console.Out.WriteLine("Computing parititions took " + (t3 - t2) + "ms");
        }
コード例 #2
0
ファイル: TeraSort.cs プロジェクト: orf53975/hadoop.net
        /// <summary>
        /// Configures the TeraSort job from the command-line arguments, runs it and
        /// waits for it to complete.
        /// </summary>
        /// <remarks>
        /// When the simple partitioner is not selected, a partition file is first
        /// written under the output directory and registered as a cache file, and the
        /// total order partitioner is used.
        /// </remarks>
        /// <param name="args">
        /// <c>args[0]</c> is the input directory and <c>args[1]</c> the output
        /// directory; any further elements are ignored
        /// </param>
        /// <returns>
        /// 0 if the job completed successfully, 1 if it did not, -1 if the partition
        /// file could not be written, and 2 if fewer than two arguments were supplied
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            // Fail with an error code rather than an index-out-of-range exception
            // when the input/output directories are missing.
            if (args == null || args.Length < 2)
            {
                Log.Error("TeraSort requires an input directory and an output directory");
                return(2);
            }
            Log.Info("starting");
            Job  job                  = Job.GetInstance(GetConf());
            Path inputDir             = new Path(args[0]);
            Path outputDir            = new Path(args[1]);
            bool useSimplePartitioner = GetUseSimplePartitioner(job);

            TeraInputFormat.SetInputPaths(job, inputDir);
            FileOutputFormat.SetOutputPath(job, outputDir);
            job.SetJobName("TeraSort");
            job.SetJarByClass(typeof(TeraSort));
            job.SetOutputKeyClass(typeof(Text));
            job.SetOutputValueClass(typeof(Text));
            job.SetInputFormatClass(typeof(TeraInputFormat));
            job.SetOutputFormatClass(typeof(TeraOutputFormat));
            if (useSimplePartitioner)
            {
                job.SetPartitionerClass(typeof(TeraSort.SimplePartitioner));
            }
            else
            {
                long start         = Runtime.CurrentTimeMillis();
                Path partitionFile = new Path(outputDir, TeraInputFormat.PartitionFilename);
                // The URI fragment names the partition file when it is added to the cache.
                URI  partitionUri  = new URI(partitionFile.ToString() + "#" + TeraInputFormat.PartitionFilename
                                             );
                try
                {
                    TeraInputFormat.WritePartitionFile(job, partitionFile);
                }
                catch (Exception e)
                {
                    // Log the whole exception (type, message, stack trace); the message
                    // alone is often empty or too terse to diagnose the failure.
                    Log.Error("Failed to write the partition file: " + e);
                    return(-1);
                }
                job.AddCacheFile(partitionUri);
                long end = Runtime.CurrentTimeMillis();
                System.Console.Out.WriteLine("Spent " + (end - start) + "ms computing partitions."
                                             );
                job.SetPartitionerClass(typeof(TeraSort.TotalOrderPartitioner));
            }
            job.GetConfiguration().SetInt("dfs.replication", GetOutputReplication(job));
            TeraOutputFormat.SetFinalSync(job, true);
            int ret = job.WaitForCompletion(true) ? 0 : 1;

            Log.Info("done");
            return(ret);
        }
コード例 #3
0
 /// <summary>
 /// Creates a sampler reader thread, stores everything it was given for later
 /// use, and marks the thread as a daemon.
 /// </summary>
 /// <param name="job">the job context kept by this thread</param>
 /// <param name="inFormat">the input format kept by this thread</param>
 /// <param name="splits">the list of input splits kept by this thread</param>
 /// <param name="sampleStep">the split step kept by this thread</param>
 /// <param name="idx">the index kept by this thread</param>
 /// <param name="sampler">the sampler kept by this thread</param>
 /// <param name="recordsPerSample">the per-sample record count kept by this thread</param>
 /// <param name="baseArg1">thread group forwarded to the base constructor</param>
 /// <param name="baseArg2">string forwarded to the base constructor (presumably the thread name)</param>
 public _Thread_140(JobContext job, TeraInputFormat inFormat, IList <InputSplit> splits,
                    int sampleStep, int idx, TeraInputFormat.TextSampler sampler,
                    long recordsPerSample, ThreadGroup baseArg1, string baseArg2)
     : base(baseArg1, baseArg2)
 {
     // Keep the sampling inputs on the instance.
     this.job      = job;
     this.inFormat = inFormat;
     this.splits   = splits;
     this.sampler  = sampler;

     // Keep the numeric sampling parameters.
     this.idx              = idx;
     this.sampleStep       = sampleStep;
     this.recordsPerSample = recordsPerSample;

     SetDaemon(true);
 }
コード例 #4
0
ファイル: TeraChecksum.cs プロジェクト: orf53975/hadoop.net
        /// <summary>
        /// Configures the "TeraSum" job from the command-line arguments, runs it with
        /// a single reducer and waits for it to complete.
        /// </summary>
        /// <param name="args">
        /// exactly two elements: the input path followed by the output path
        /// </param>
        /// <returns>
        /// 2 (after calling <c>Usage()</c>) when the argument count is not two;
        /// otherwise 0 if the job completed successfully and 1 if it did not
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Job job = Job.GetInstance(GetConf());

            if (args.Length != 2)
            {
                Usage();
                return 2;
            }

            // Input and output locations.
            Path inputPath = new Path(args[0]);
            TeraInputFormat.SetInputPaths(job, inputPath);
            Path outputPath = new Path(args[1]);
            FileOutputFormat.SetOutputPath(job, outputPath);

            // Job identity and the map/reduce classes.
            job.SetJobName("TeraSum");
            job.SetJarByClass(typeof(TeraChecksum));
            job.SetMapperClass(typeof(TeraChecksum.ChecksumMapper));
            job.SetReducerClass(typeof(TeraChecksum.ChecksumReducer));
            job.SetOutputKeyClass(typeof(NullWritable));
            job.SetOutputValueClass(typeof(Unsigned16));

            // force a single reducer
            job.SetNumReduceTasks(1);
            job.SetInputFormatClass(typeof(TeraInputFormat));

            bool succeeded = job.WaitForCompletion(true);
            if (succeeded)
            {
                return 0;
            }
            return 1;
        }
コード例 #5
0
        /// <summary>
        /// Configures the "TeraValidate" job from the command-line arguments, runs it
        /// with a single reducer and a single input split, and waits for it to complete.
        /// </summary>
        /// <param name="args">
        /// exactly two elements: the input path followed by the output path
        /// </param>
        /// <returns>
        /// 1 (after calling <c>Usage()</c>) when the argument count is not two;
        /// otherwise 0 if the job completed successfully and 1 if it did not
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual int Run(string[] args)
        {
            Job job = Job.GetInstance(GetConf());

            if (args.Length != 2)
            {
                Usage();
                return 1;
            }

            // Input and output locations.
            Path inputPath = new Path(args[0]);
            TeraInputFormat.SetInputPaths(job, inputPath);
            Path outputPath = new Path(args[1]);
            FileOutputFormat.SetOutputPath(job, outputPath);

            // Job identity and the map/reduce classes.
            job.SetJobName("TeraValidate");
            job.SetJarByClass(typeof(TeraValidate));
            job.SetMapperClass(typeof(TeraValidate.ValidateMapper));
            job.SetReducerClass(typeof(TeraValidate.ValidateReducer));
            job.SetOutputKeyClass(typeof(Text));
            job.SetOutputValueClass(typeof(Text));

            // force a single reducer
            job.SetNumReduceTasks(1);
            // force a single split
            FileInputFormat.SetMinInputSplitSize(job, long.MaxValue);
            job.SetInputFormatClass(typeof(TeraInputFormat));

            bool succeeded = job.WaitForCompletion(true);
            if (succeeded)
            {
                return 0;
            }
            return 1;
        }