Ejemplo n.º 1
0
 /// <summary>
 /// Turns the accumulated per-pip sums in <paramref name="linear"/> into averages,
 /// or marks a whole group as "not present" (-1) when no pips of that kind were seen.
 /// </summary>
 /// <param name="linear">Artifact whose accumulated sums are adjusted in place.</param>
 /// <param name="inputAdjust">Divisor for the input-pip sums (total input pip count).</param>
 /// <param name="outputAdjust">Divisor for the output-pip sums (total output pip count).</param>
 private void AdjustAverages(MLArtifact linear, double inputAdjust, double outputAdjust)
 {
     // Hoist the divisor computation (the original re-evaluated the same ternary 16 times):
     // a non-positive adjust factor degenerates to 1.0 so we never divide by zero.
     var inputDivisor  = inputAdjust > 0 ? inputAdjust : 1.0;
     var outputDivisor = outputAdjust > 0 ? outputAdjust : 1.0;

     if (linear.AvgInputPips > 0)
     {
         // input
         linear.AvgDepsForInputPips           /= inputDivisor;
         linear.AvgInputsForInputPips         /= inputDivisor;
         linear.AvgOutputsForInputPips        /= inputDivisor;
         linear.AvgPriorityForInputPips       /= inputDivisor;
         linear.AvgWeightForInputPips         /= inputDivisor;
         linear.AvgTagCountForInputPips       /= inputDivisor;
         linear.AvgSemaphoreCountForInputPips /= inputDivisor;
         linear.AvgPositionForInputPips       /= inputDivisor;
         // the pip count itself is averaged over the number of builds seen
         linear.AvgInputPips /= linear.Builds.Count;
     }
     else
     {
         // not present
         linear.AvgDepsForInputPips           = -1;
         linear.AvgInputsForInputPips         = -1;
         linear.AvgOutputsForInputPips        = -1;
         linear.AvgPriorityForInputPips       = -1;
         linear.AvgWeightForInputPips         = -1;
         linear.AvgTagCountForInputPips       = -1;
         linear.AvgSemaphoreCountForInputPips = -1;
         linear.AvgPositionForInputPips       = -1;
         linear.AvgInputPips = -1;
     }
     if (linear.AvgOutputPips > 0)
     {
         // output
         linear.AvgDepsForOutputPips           /= outputDivisor;
         linear.AvgInputsForOutputPips         /= outputDivisor;
         linear.AvgOutputsForOutputPips        /= outputDivisor;
         linear.AvgPriorityForOutputPips       /= outputDivisor;
         linear.AvgWeightForOutputPips         /= outputDivisor;
         linear.AvgTagCountForOutputPips       /= outputDivisor;
         linear.AvgSemaphoreCountForOutputPips /= outputDivisor;
         linear.AvgPositionForOutputPips       /= outputDivisor;
         // the pip count itself is averaged over the number of builds seen
         linear.AvgOutputPips /= linear.Builds.Count;
     }
     else
     {
         // not present
         linear.AvgDepsForOutputPips           = -1;
         linear.AvgInputsForOutputPips         = -1;
         linear.AvgOutputsForOutputPips        = -1;
         linear.AvgPriorityForOutputPips       = -1;
         linear.AvgWeightForOutputPips         = -1;
         linear.AvgTagCountForOutputPips       = -1;
         linear.AvgSemaphoreCountForOutputPips = -1;
         linear.AvgPositionForOutputPips       = -1;
         linear.AvgOutputPips = -1;
     }
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Write the samples in this specific format.
        /// </summary>
        /// <summary>
        /// Write the samples in this specific format.
        /// </summary>
        /// <param name="input">Carries the per-queue scale, the collected artifacts and the target sample file name.</param>
        /// <returns>The randomized sample, its size and the sample file name.</returns>
        protected override SampleArtifactsOutput Perform(SampleArtifactsInput input)
        {
            s_logger.Debug("SampleArtifacts starts...");
            var written = 0;

            try
            {
                // store the sample
                var sampled = new List <MLArtifact>();
                // we have the scale, now we can just randomly choose
                foreach (var scaled in input.Scale)
                {
                    var nq         = scaled.Key;
                    var linearized = input.Artifacts[nq];
                    if (linearized.Count <= scaled.Value)
                    {
                        // bucket is smaller than the quota: take everything
                        sampled.AddRange(linearized);
                    }
                    else
                    {
                        // take a random set of distinct positions; the HashSet dedupes,
                        // and termination is guaranteed because Count > scaled.Value here
                        var randomIds = new HashSet <int>();
                        while (randomIds.Count < scaled.Value)
                        {
                            randomIds.Add(m_random.Next(linearized.Count));
                        }
                        // now take them
                        foreach (var pos in randomIds)
                        {
                            sampled.Add(linearized[pos]);
                        }
                    }
                }
                // randomize
                var randomized = sampled.OrderBy(a => m_random.Next()).ToList();
                // write the headers
                MLArtifact.WriteColumnsToStream(m_writer);
                // and then write the randomized values
                foreach (var linear in randomized)
                {
                    linear.WriteToCsvStream(m_writer);
                    // BUGFIX: written was never incremented, so the closing log always reported 0
                    ++written;
                }
                // done
                return(new SampleArtifactsOutput(randomized, sampled.Count, input.SampleFileName));
            }
            finally
            {
                s_logger.Debug($"SampleArtifacts ends in {Stopwatch.ElapsedMilliseconds}ms, sample [{input.SampleFileName}] contains {written} artifacts");
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Folds a single artifact into the running linearization and returns the
        /// number of input and output pips that were accumulated for it.
        /// </summary>
        /// <param name="linear">The accumulator that collects data across artifacts.</param>
        /// <param name="current">The artifact (with its build metadata) to fold in.</param>
        /// <returns>Tuple of (input pip count, output pip count).</returns>
        private Tuple <int, int> LinearizeTo(MLArtifact linear, ArtifactWithBuildMeta current)
        {
            // remember which queue and build this artifact came from
            linear.Queues.Add(current.Meta.BuildQueue);
            linear.Builds.Add(current.Meta.BuidId);
            // same hash == same size
            linear.SizeBytes = current.Artifact.ReportedSize;
            // accumulate the input pips first, then the output pips
            var inputPipCount  = LinearizePips(linear, current.Artifact.InputPips, current, true);
            var outputPipCount = LinearizePips(linear, current.Artifact.OutputPips, current, false);

            return(new Tuple <int, int>(inputPipCount, outputPipCount));
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Accumulates the per-pip metrics of the given pip list into the linearized
        /// artifact and returns how many pips were processed. The values added here
        /// are running sums; they are divided down to averages later.
        /// </summary>
        /// <param name="linear">The accumulator receiving the sums.</param>
        /// <param name="pips">The pips to fold in.</param>
        /// <param name="artifact">Owning artifact, used for relative-position computation.</param>
        /// <param name="inputPips">True to target the input-pip fields, false for output-pip fields.</param>
        /// <returns>The number of pips in <paramref name="pips"/>.</returns>
        private int LinearizePips(MLArtifact linear, List <BxlPipData> pips, ArtifactWithBuildMeta artifact, bool inputPips)
        {
            // local running sums (the original called these "avg*", but they are sums)
            var count     = 0;
            var sumDeps   = 0.0;
            var sumIns    = 0.0;
            var sumOuts   = 0.0;
            var sumPrio   = 0.0;
            var sumWeight = 0.0;
            var sumTags   = 0.0;
            var sumSems   = 0.0;
            var sumPos    = 0.0;

            foreach (var pip in pips)
            {
                sumDeps   += pip.DependencyCount;
                sumIns    += pip.InputCount;
                sumOuts   += pip.OutputCount;
                sumPrio   += pip.Priority;
                sumWeight += pip.Weight;
                sumTags   += pip.TagCount;
                sumSems   += pip.SemaphoreCount;
                // input pips are positioned by their start, output pips by their end
                sumPos    += inputPips? CalculateRelativeStartPosition(pip, artifact.Meta) : CalculateRelativeEndPosition(pip, artifact.Meta);
                ++count;
            }
            // route the sums to the matching field group
            if (inputPips)
            {
                linear.AvgDepsForInputPips           += sumDeps;
                linear.AvgInputsForInputPips         += sumIns;
                linear.AvgOutputsForInputPips        += sumOuts;
                linear.AvgPriorityForInputPips       += sumPrio;
                linear.AvgWeightForInputPips         += sumWeight;
                linear.AvgTagCountForInputPips       += sumTags;
                linear.AvgSemaphoreCountForInputPips += sumSems;
                linear.AvgPositionForInputPips       += sumPos;
                linear.AvgInputPips += count;
            }
            else
            {
                linear.AvgDepsForOutputPips           += sumDeps;
                linear.AvgInputsForOutputPips         += sumIns;
                linear.AvgOutputsForOutputPips        += sumOuts;
                linear.AvgPriorityForOutputPips       += sumPrio;
                linear.AvgWeightForOutputPips         += sumWeight;
                linear.AvgTagCountForOutputPips       += sumTags;
                linear.AvgSemaphoreCountForOutputPips += sumSems;
                linear.AvgPositionForOutputPips       += sumPos;
                linear.AvgOutputPips += count;
            }
            return(count);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Linearize artifacts one by one
        /// </summary>
        /// <summary>
        /// Linearize artifacts one by one
        /// </summary>
        /// <param name="input">Either a folder containing one json file per artifact, or a preloaded artifact list.</param>
        /// <returns>The number of distinct queues seen plus the linearized artifact.</returns>
        /// <exception cref="Exception">Rethrows any failure after logging the offending file.</exception>
        protected override LinearizeArtifactsOutput Perform(LinearizeArtifactsInput input)
        {
            string currentFile = null;

            try
            {
                var totalIpips = 0;
                var totalOpips = 0;
                var linear     = new MLArtifact();
                // we are supposed to have a set of files here, or a list
                if (input.ArtifactsForHash == null)
                {
                    foreach (var file in Directory.EnumerateFiles(input.ArtifactFolder, "*.json"))
                    {
                        currentFile = file;
                        // so in here we will read a single input file and classify its artifacts.
                        // BUGFIX: the stream/json readers were never disposed, leaking one
                        // file handle per artifact file processed
                        ArtifactWithBuildMeta artifact;
                        using (var streamReader = new StreamReader(file))
                        using (var jsonReader = new JsonTextReader(streamReader))
                        {
                            artifact = new JsonSerializer().Deserialize <ArtifactWithBuildMeta>(jsonReader);
                        }
                        // so we have it in here, lets start accumulating
                        var linearResult = LinearizeTo(linear, artifact);
                        totalIpips += linearResult.Item1;
                        totalOpips += linearResult.Item2;
                        linear.ReportedPaths.Add(artifact.Artifact.ReportedFile);
                    }
                    // set the hash
                    // NOTE(review): GetDirectoryName yields the PARENT path of ArtifactFolder,
                    // not its leaf name; if the hash is the folder name itself,
                    // Path.GetFileName may be intended — confirm against callers.
                    linear.Hash = Path.GetDirectoryName(input.ArtifactFolder);
                }
                else
                {
                    foreach (var artifact in input.ArtifactsForHash)
                    {
                        // so we have it in here, lets start accumulating
                        var linearResult = LinearizeTo(linear, artifact);
                        totalIpips += linearResult.Item1;
                        totalOpips += linearResult.Item2;
                        // and the paths
                        linear.ReportedPaths.Add(artifact.Artifact.ReportedFile);
                    }
                    // set the hash
                    linear.Hash = input.Hash;
                }
                // and when we are done, we calculate the avgs
                AdjustAverages(linear, totalIpips, totalOpips);
                // and write
                if (m_database != null)
                {
                    linear.WriteToCsvStream(m_database);
                }
                // done...
                return(new LinearizeArtifactsOutput(linear.Queues.Count, linear));
            }
            catch (Exception)
            {
                s_logger.Error($"Artifact [{currentFile}] reported an exception");
                throw;
            }
            // deliberately no per-task success logging: this task is too small and
            // logging each one would hurt performance
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Constructor
 /// </summary>
 /// <summary>
 /// Constructor: captures the queue count and the linearized artifact.
 /// </summary>
 /// <param name="nq">Number of distinct queues the artifact was seen in.</param>
 /// <param name="lin">The linearized artifact itself.</param>
 public LinearizeArtifactsOutput(int nq, MLArtifact lin)
 {
     Linear    = lin;
     NumQueues = nq;
 }
Ejemplo n.º 7
0
        /// <summary>
        /// Linearizes every hash directory under the input directory into a single csv
        /// database (in parallel, via TPL Dataflow) and then creates the requested
        /// number of samples from the collected artifacts.
        /// </summary>
        /// <param name="arguments">Parsed command line arguments (input dir, sample count/size, concurrency).</param>
        private static void LinearizeDatabase(Args arguments)
        {
            // and a couple of checks here
            Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
            Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
            var collectedArtifacts = new MultiValueDictionary <int, MLArtifact>();
            var currentTicks       = Environment.TickCount;
            var linearFile         = $"{Path.Combine(arguments.InputDirectory, $"{Convert.ToString(currentTicks)}.csv")}";
            var posted             = 0;

            // BUGFIX: the synchronized writer is now disposed even if one of the
            // dataflow waits below throws (disposing again after Close() is harmless)
            using (var linearOutput = TextWriter.Synchronized(new StreamWriter(linearFile)))
            {
                s_logger.Info($"Linearizing to [{linearFile}]");
                // write the headers
                MLArtifact.WriteColumnsToStream(linearOutput);
                // so now we are ready to linearize
                var linearizeBlock = new TransformBlock <LinearizeArtifactsInput, TimedActionResult <LinearizeArtifactsOutput> >(
                    i =>
                {
                    var action = new LinearizeArtifacts(linearOutput);
                    return(action.PerformAction(i));
                },
                    new ExecutionDataflowBlockOptions()
                {
                    MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
                });
                var collectLinearResultsBlock = new ActionBlock <TimedActionResult <LinearizeArtifactsOutput> >(
                    i =>
                {
                    if (i.ExecutionStatus)
                    {
                        collectedArtifacts.Add(i.Result.NumQueues, i.Result.Linear);
                    }
                },
                    // enforce serial collection so the dictionary is never written concurrently
                    new ExecutionDataflowBlockOptions()
                {
                    MaxDegreeOfParallelism = 1
                });

                // connect
                linearizeBlock.LinkTo(collectLinearResultsBlock, new DataflowLinkOptions {
                    PropagateCompletion = true
                });
                // and post the tasks
                foreach (var hashDir in Directory.EnumerateDirectories(arguments.InputDirectory))
                {
                    linearizeBlock.Post(new LinearizeArtifactsInput(hashDir));
                    ++posted;
                }
                s_logger.Info($"Posted {posted} linearizing tasks, waiting...");
                linearizeBlock.Complete();
                // and wait (completion propagates from linearizeBlock via the link above)
                collectLinearResultsBlock.Completion.Wait();
                // and close...
                linearOutput.Close();
            }
            // now, scale to create the samples...
            s_logger.Info($"Creating {arguments.NumSamples} samples of size {arguments.SampleSize}");
            var scale = new Dictionary <int, int>();

            foreach (var entry in collectedArtifacts)
            {
                var queueCount = entry.Key;
                var entryCount = entry.Value.Count;
                // proportional allocation: this bucket's share of the requested sample size
                var proportion = 1.0 * Math.BigMul(entryCount, arguments.SampleSize) / (1.0 * posted);
                scale[queueCount] = (int)Math.Ceiling(proportion);
            }
            // we have the scale, lets post tasks here
            var createSampleBlocks = new ActionBlock <SampleArtifactsInput>(
                i =>
            {
                var action = new SampleArtifacts();
                action.PerformAction(i);
            },
                // one per each sample
                new ExecutionDataflowBlockOptions()
            {
                MaxDegreeOfParallelism = arguments.NumSamples
            });

            // post some tasks in here
            for (var i = 0; i < arguments.NumSamples; ++i)
            {
                createSampleBlocks.Post(new SampleArtifactsInput($"{Path.Combine(arguments.InputDirectory, $"{Convert.ToString(currentTicks)}-sample{i}.csv")}", scale, collectedArtifacts));
            }
            // and wait...
            createSampleBlocks.Complete();
            createSampleBlocks.Completion.Wait();
            // done...
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Evaluates a batch of TRAINING instances and computes a confusion matrix
        /// </summary>
        /// <summary>
        /// Evaluates a batch of TRAINING instances and computes a confusion matrix
        /// </summary>
        /// <param name="trainingFile">Csv file with one instance per line.</param>
        /// <param name="hasHeaders">When true, the very first line is a column header and is skipped.</param>
        /// <param name="parallel">Whether to use the parallel classification path.</param>
        /// <param name="maxParalellism">Max degree of parallelism for the parallel path.</param>
        public void EvaluateOnTrainingSet(string trainingFile, bool hasHeaders, bool parallel, int maxParalellism)
        {
            // parse the instances
            s_logger.Info($"Parsing instances from [{trainingFile}]");
            var instances = new List <RandomForestInstance>();

            // BUGFIX: the reader was only closed on the success path; using guarantees disposal
            using (var rdr = new StreamReader(trainingFile))
            {
                string line;
                while ((line = rdr.ReadLine()) != null)
                {
                    if (hasHeaders)
                    {
                        // skip column headers (flag is cleared so only the first line is skipped)
                        hasHeaders = false;
                        continue;
                    }
                    instances.Add(MLArtifact.FromCsvString(line, DefaultPrecision));
                }
            }
            // now start evaluating them
            s_logger.Info($"Evaluation starting...");
            var timer     = Stopwatch.StartNew();
            var confusion = new Dictionary <string, double>();

            // seed the confusion matrix so every class has both buckets present
            foreach (var cl in Classes)
            {
                confusion[$"{cl}-true"]  = 0.0;
                confusion[$"{cl}-false"] = 0.0;
            }
            foreach (var instance in instances)
            {
                var result = parallel
                    ? Classify(instance, maxParalellism)
                    : Classify(instance);

                var prediction = result.Value;

                if (MLArtifact.Evaluate(instance, prediction))
                {
                    confusion[$"{prediction}-true"] += 1;
                }
                else
                {
                    confusion[$"{prediction}-false"] += 1;
                }
            }
            timer.Stop();
            var elapsedEvaluationMillis = timer.ElapsedMilliseconds;
            var perInstanceTime         = elapsedEvaluationMillis * 1.0 / instances.Count * 1.0;

            s_logger.Info($"Times: Total={elapsedEvaluationMillis}ms, PerInstanceAvg={perInstanceTime}ms");
            var correctlyClassified = 0.0;
            var errors = 0.0;

            foreach (var entry in confusion)
            {
                if (entry.Key.Contains("-true"))
                {
                    correctlyClassified += entry.Value;
                }
                else
                {
                    errors += entry.Value;
                }
            }
            // BUGFIX: typo in the log message ("Overral" -> "Overall")
            s_logger.Info($"Overall accuracy: {(correctlyClassified) / (correctlyClassified + errors)}");
            s_logger.Info("Confusion Matrix:");
            foreach (var entry in confusion)
            {
                s_logger.Info($"{entry.Key}:{entry.Value}");
            }
            // per class stats: recall denominators mix true positives with the
            // opposite class's false predictions (misses of this class)
            var sharedPrecision    = confusion["Shared-true"] / (confusion["Shared-true"] + confusion["Shared-false"]);
            var sharedRecall       = confusion["Shared-true"] / (confusion["Shared-true"] + confusion["NonShared-false"]);
            var nonSharedPrecision = confusion["NonShared-true"] / (confusion["NonShared-true"] + confusion["NonShared-false"]);
            var nonSharedRecall    = confusion["NonShared-true"] / (confusion["NonShared-true"] + confusion["Shared-false"]);

            s_logger.Info("Per class stats:");
            s_logger.Info($"Shared: precision={sharedPrecision}, recall={sharedRecall}");
            s_logger.Info($"NonShared: precision={nonSharedPrecision}, recall={nonSharedRecall}");
        }