private void AdjustAverages(MLArtifact linear, double inputAdjust, double outputAdjust)
{
    var inputDivisor = inputAdjust > 0 ? inputAdjust : 1.0;
    var outputDivisor = outputAdjust > 0 ? outputAdjust : 1.0;
    if (linear.AvgInputPips > 0)
    {
        // input
        linear.AvgDepsForInputPips /= inputDivisor;
        linear.AvgInputsForInputPips /= inputDivisor;
        linear.AvgOutputsForInputPips /= inputDivisor;
        linear.AvgPriorityForInputPips /= inputDivisor;
        linear.AvgWeightForInputPips /= inputDivisor;
        linear.AvgTagCountForInputPips /= inputDivisor;
        linear.AvgSemaphoreCountForInputPips /= inputDivisor;
        linear.AvgPositionForInputPips /= inputDivisor;
        linear.AvgInputPips /= linear.Builds.Count;
    }
    else
    {
        // not present
        linear.AvgDepsForInputPips = -1;
        linear.AvgInputsForInputPips = -1;
        linear.AvgOutputsForInputPips = -1;
        linear.AvgPriorityForInputPips = -1;
        linear.AvgWeightForInputPips = -1;
        linear.AvgTagCountForInputPips = -1;
        linear.AvgSemaphoreCountForInputPips = -1;
        linear.AvgPositionForInputPips = -1;
        linear.AvgInputPips = -1;
    }

    if (linear.AvgOutputPips > 0)
    {
        // output
        linear.AvgDepsForOutputPips /= outputDivisor;
        linear.AvgInputsForOutputPips /= outputDivisor;
        linear.AvgOutputsForOutputPips /= outputDivisor;
        linear.AvgPriorityForOutputPips /= outputDivisor;
        linear.AvgWeightForOutputPips /= outputDivisor;
        linear.AvgTagCountForOutputPips /= outputDivisor;
        linear.AvgSemaphoreCountForOutputPips /= outputDivisor;
        linear.AvgPositionForOutputPips /= outputDivisor;
        linear.AvgOutputPips /= linear.Builds.Count;
    }
    else
    {
        // not present
        linear.AvgDepsForOutputPips = -1;
        linear.AvgInputsForOutputPips = -1;
        linear.AvgOutputsForOutputPips = -1;
        linear.AvgPriorityForOutputPips = -1;
        linear.AvgWeightForOutputPips = -1;
        linear.AvgTagCountForOutputPips = -1;
        linear.AvgSemaphoreCountForOutputPips = -1;
        linear.AvgPositionForOutputPips = -1;
        linear.AvgOutputPips = -1;
    }
}
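// Illustrative note (added for clarity, not part of the original source): LinearizePips below
// accumulates per-pip sums (deps, inputs, outputs, priority, weight, tag and semaphore counts,
// relative positions) plus the pip counts into the MLArtifact across all builds that share the
// hash, and LinearizeArtifacts then calls AdjustAverages(linear, totalIpips, totalOpips) so the
// sums become averages: each "ForInputPips"/"ForOutputPips" field is divided by the total pip
// count, AvgInputPips/AvgOutputPips by the number of builds, and -1 marks "no pips of this kind".
// For example, assuming two builds contributed 4 input pips in total with summed
// DependencyCount = 12, the call AdjustAverages(linear, 4, totalOpips) yields
//   AvgDepsForInputPips = 12 / 4 = 3.0   (average dependency count per input pip)
//   AvgInputPips        = 4 / 2  = 2.0   (average number of input pips per build)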
/// <summary>
/// Write the samples in this specific format.
/// </summary>
protected override SampleArtifactsOutput Perform(SampleArtifactsInput input)
{
    s_logger.Debug("SampleArtifacts starts...");
    var written = 0;
    try
    {
        // store the sample
        var sampled = new List<MLArtifact>();
        // we have the scale, now we can just randomly choose
        foreach (var scaled in input.Scale)
        {
            var nq = scaled.Key;
            var linearized = input.Artifacts[nq];
            if (linearized.Count <= scaled.Value)
            {
                // write all
                foreach (var linear in linearized)
                {
                    sampled.Add(linear);
                }
            }
            else
            {
                // take a random set of distinct positions
                var randomIds = new HashSet<int>();
                while (randomIds.Count < scaled.Value)
                {
                    randomIds.Add(m_random.Next(linearized.Count));
                }
                // now take them
                foreach (var pos in randomIds)
                {
                    sampled.Add(linearized[pos]);
                }
            }
        }
        // randomize
        var randomized = sampled.OrderBy(a => m_random.Next()).ToList();
        // write the headers
        MLArtifact.WriteColumnsToStream(m_writer);
        // and then write the randomized values
        foreach (var linear in randomized)
        {
            linear.WriteToCsvStream(m_writer);
            ++written;
        }
        // done
        return new SampleArtifactsOutput(randomized, sampled.Count, input.SampleFileName);
    }
    finally
    {
        s_logger.Debug($"SampleArtifacts ends in {Stopwatch.ElapsedMilliseconds}ms, sample [{input.SampleFileName}] contains {written} artifacts");
    }
}
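// A minimal, self-contained sketch (an assumption for illustration, not part of the original
// source) of the sampling technique used above: draw distinct random indices into a HashSet
// until the requested sample size is reached, then materialize the selected items. The helper
// name SampleWithoutReplacement is hypothetical.
private static List<T> SampleWithoutReplacement<T>(IReadOnlyList<T> items, int sampleSize, Random random)
{
    // small buckets are taken whole, mirroring the "write all" branch above
    if (items.Count <= sampleSize)
    {
        return new List<T>(items);
    }
    // distinct random positions; the HashSet guarantees no index is picked twice
    var indices = new HashSet<int>();
    while (indices.Count < sampleSize)
    {
        indices.Add(random.Next(items.Count));
    }
    var result = new List<T>(sampleSize);
    foreach (var index in indices)
    {
        result.Add(items[index]);
    }
    return result;
}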
private Tuple<int, int> LinearizeTo(MLArtifact linear, ArtifactWithBuildMeta current)
{
    linear.Queues.Add(current.Meta.BuildQueue);
    linear.Builds.Add(current.Meta.BuidId);
    // same hash == same size
    linear.SizeBytes = current.Artifact.ReportedSize;
    // now, do the pips...
    var totalIpips = LinearizePips(linear, current.Artifact.InputPips, current, true);
    var totalOpips = LinearizePips(linear, current.Artifact.OutputPips, current, false);
    return new Tuple<int, int>(totalIpips, totalOpips);
}
private int LinearizePips(MLArtifact linear, List<BxlPipData> pips, ArtifactWithBuildMeta artifact, bool inputPips)
{
    var pipCount = 0;
    var avgDeps = 0.0;
    var avgIns = 0.0;
    var avgOuts = 0.0;
    var avgPrio = 0.0;
    var avgWeight = 0.0;
    var avgTC = 0.0;
    var avgSC = 0.0;
    var avgPos = 0.0;
    foreach (var pip in pips)
    {
        avgDeps += pip.DependencyCount;
        avgIns += pip.InputCount;
        avgOuts += pip.OutputCount;
        avgPrio += pip.Priority;
        avgWeight += pip.Weight;
        avgTC += pip.TagCount;
        avgSC += pip.SemaphoreCount;
        avgPos += inputPips
            ? CalculateRelativeStartPosition(pip, artifact.Meta)
            : CalculateRelativeEndPosition(pip, artifact.Meta);
        ++pipCount;
    }
    // now assign depending on type
    if (inputPips)
    {
        linear.AvgDepsForInputPips += avgDeps;
        linear.AvgInputsForInputPips += avgIns;
        linear.AvgOutputsForInputPips += avgOuts;
        linear.AvgPriorityForInputPips += avgPrio;
        linear.AvgWeightForInputPips += avgWeight;
        linear.AvgTagCountForInputPips += avgTC;
        linear.AvgSemaphoreCountForInputPips += avgSC;
        linear.AvgPositionForInputPips += avgPos;
        linear.AvgInputPips += pipCount;
    }
    else
    {
        linear.AvgDepsForOutputPips += avgDeps;
        linear.AvgInputsForOutputPips += avgIns;
        linear.AvgOutputsForOutputPips += avgOuts;
        linear.AvgPriorityForOutputPips += avgPrio;
        linear.AvgWeightForOutputPips += avgWeight;
        linear.AvgTagCountForOutputPips += avgTC;
        linear.AvgSemaphoreCountForOutputPips += avgSC;
        linear.AvgPositionForOutputPips += avgPos;
        linear.AvgOutputPips += pipCount;
    }
    return pipCount;
}
/// <summary>
/// Linearize artifacts one by one
/// </summary>
protected override LinearizeArtifactsOutput Perform(LinearizeArtifactsInput input)
{
    string currentFile = null;
    try
    {
        var totalIpips = 0;
        var totalOpips = 0;
        var linear = new MLArtifact();
        // we either get a folder with one json file per artifact, or a preloaded list
        if (input.ArtifactsForHash == null)
        {
            foreach (var file in Directory.EnumerateFiles(input.ArtifactFolder, "*.json"))
            {
                currentFile = file;
                // read a single input file and classify its artifact
                ArtifactWithBuildMeta artifact;
                using (var streamReader = new StreamReader(file))
                using (var jsonReader = new JsonTextReader(streamReader))
                {
                    artifact = new JsonSerializer().Deserialize<ArtifactWithBuildMeta>(jsonReader);
                }
                // we have it here, start accumulating
                var linearResult = LinearizeTo(linear, artifact);
                totalIpips += linearResult.Item1;
                totalOpips += linearResult.Item2;
                linear.ReportedPaths.Add(artifact.Artifact.ReportedFile);
            }
            // set the hash: the artifact folder is named after the hash
            linear.Hash = Path.GetFileName(input.ArtifactFolder);
        }
        else
        {
            foreach (var artifact in input.ArtifactsForHash)
            {
                // we have it here, start accumulating
                var linearResult = LinearizeTo(linear, artifact);
                totalIpips += linearResult.Item1;
                totalOpips += linearResult.Item2;
                // and the paths
                linear.ReportedPaths.Add(artifact.Artifact.ReportedFile);
            }
            // set the hash
            linear.Hash = input.Hash;
        }
        // and when we are done, we calculate the averages
        AdjustAverages(linear, totalIpips, totalOpips);
        // and write
        if (m_database != null)
        {
            linear.WriteToCsvStream(m_database);
        }
        // done...
        return new LinearizeArtifactsOutput(linear.Queues.Count, linear);
    }
    catch (Exception)
    {
        s_logger.Error($"Artifact [{currentFile}] reported an exception");
        throw;
    }
    finally
    {
        // intentionally no logging here: this task runs very often and per-task logging would hurt performance
    }
}
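// Illustrative usage sketch (an assumption, not from the original source): linearizing a single
// hash folder outside the dataflow pipeline. The LinearizeArtifacts(TextWriter) and
// LinearizeArtifactsInput(string) constructors are the ones used by LinearizeDatabase below;
// the folder path and output file name are hypothetical.
using (var writer = new StreamWriter("linearized.csv"))
{
    MLArtifact.WriteColumnsToStream(writer);
    var action = new LinearizeArtifacts(writer);
    var result = action.PerformAction(new LinearizeArtifactsInput(@"D:\artifacts\SOME_HASH"));
    if (result.ExecutionStatus)
    {
        Console.WriteLine($"Hash seen in {result.Result.NumQueues} queue(s), {result.Result.Linear.ReportedPaths.Count} reported path(s)");
    }
}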
/// <summary>
/// Constructor
/// </summary>
public LinearizeArtifactsOutput(int nq, MLArtifact lin)
{
    NumQueues = nq;
    Linear = lin;
}
private static void LinearizeDatabase(Args arguments)
{
    // and a couple of checks here
    Contract.Requires(arguments.InputDirectory != null, "You must specify an input directory");
    Contract.Requires(Directory.Exists(arguments.InputDirectory), "The input directory must exist");
    var collectedArtifacts = new MultiValueDictionary<int, MLArtifact>();
    var currentTicks = Environment.TickCount;
    var linearFile = Path.Combine(arguments.InputDirectory, $"{currentTicks}.csv");
    var linearOutput = TextWriter.Synchronized(new StreamWriter(linearFile));
    s_logger.Info($"Linearizing to [{linearFile}]");
    // write the headers
    MLArtifact.WriteColumnsToStream(linearOutput);
    // so now we are ready to linearize
    var linearizeBlock = new TransformBlock<LinearizeArtifactsInput, TimedActionResult<LinearizeArtifactsOutput>>(
        i =>
        {
            var action = new LinearizeArtifacts(linearOutput);
            return action.PerformAction(i);
        },
        new ExecutionDataflowBlockOptions()
        {
            MaxDegreeOfParallelism = arguments.AppConfig.ConcurrencyConfig.MaxArtifactLinearizationTasks
        }
    );
    var collectLinearResultsBlock = new ActionBlock<TimedActionResult<LinearizeArtifactsOutput>>(
        i =>
        {
            if (i.ExecutionStatus)
            {
                collectedArtifacts.Add(i.Result.NumQueues, i.Result.Linear);
            }
        },
        // enforce serial collection
        new ExecutionDataflowBlockOptions()
        {
            MaxDegreeOfParallelism = 1
        }
    );
    // connect
    linearizeBlock.LinkTo(collectLinearResultsBlock, new DataflowLinkOptions { PropagateCompletion = true });
    // and post the tasks
    var posted = 0;
    foreach (var hashDir in Directory.EnumerateDirectories(arguments.InputDirectory))
    {
        linearizeBlock.Post(new LinearizeArtifactsInput(hashDir));
        ++posted;
    }
    s_logger.Info($"Posted {posted} linearizing tasks, waiting...");
    linearizeBlock.Complete();
    // and wait
    collectLinearResultsBlock.Completion.Wait();
    // and close...
    linearOutput.Close();
    // now, scale to create the samples...
    s_logger.Info($"Creating {arguments.NumSamples} samples of size {arguments.SampleSize}");
    var scale = new Dictionary<int, int>();
    foreach (var entry in collectedArtifacts)
    {
        var queueCount = entry.Key;
        var entryCount = entry.Value.Count;
        var proportion = 1.0 * Math.BigMul(entryCount, arguments.SampleSize) / posted;
        scale[queueCount] = (int)Math.Ceiling(proportion);
    }
    // we have the scale, post the sampling tasks
    var createSampleBlocks = new ActionBlock<SampleArtifactsInput>(
        i =>
        {
            var action = new SampleArtifacts();
            action.PerformAction(i);
        },
        // one per sample
        new ExecutionDataflowBlockOptions()
        {
            MaxDegreeOfParallelism = arguments.NumSamples
        }
    );
    // post the sampling tasks
    for (var i = 0; i < arguments.NumSamples; ++i)
    {
        createSampleBlocks.Post(new SampleArtifactsInput(Path.Combine(arguments.InputDirectory, $"{currentTicks}-sample{i}.csv"), scale, collectedArtifacts));
    }
    // and wait...
    createSampleBlocks.Complete();
    createSampleBlocks.Completion.Wait();
    // done...
}
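// Illustrative note (added for clarity, not from the original source) on the scaling loop above:
// for each "number of queues" bucket q the per-sample quota is
//   scale[q] = ceil(count(q) * SampleSize / posted)
// i.e. every sample preserves the bucket proportions of the full data set. For example, assuming
// posted = 10,000 linearized artifacts, SampleSize = 1,000 and 2,500 artifacts seen in exactly
// 3 queues, scale[3] = ceil(2,500 * 1,000 / 10,000) = 250, so SampleArtifacts draws at most 250
// artifacts from that bucket for each sample file.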
/// <summary>
/// Evaluates a batch of TRAINING instances and computes a confusion matrix
/// </summary>
public void EvaluateOnTrainingSet(string trainingFile, bool hasHeaders, bool parallel, int maxParalellism)
{
    // parse the instances
    s_logger.Info($"Parsing instances from [{trainingFile}]");
    var instances = new List<RandomForestInstance>();
    using (var rdr = new StreamReader(trainingFile))
    {
        string line;
        while ((line = rdr.ReadLine()) != null)
        {
            if (hasHeaders)
            {
                // skip the column headers (first line only)
                hasHeaders = false;
                continue;
            }
            instances.Add(MLArtifact.FromCsvString(line, DefaultPrecision));
        }
    }
    // now start evaluating them
    s_logger.Info("Evaluation starting...");
    var timer = Stopwatch.StartNew();
    var confusion = new Dictionary<string, double>();
    foreach (var cl in Classes)
    {
        confusion[$"{cl}-true"] = 0.0;
        confusion[$"{cl}-false"] = 0.0;
    }
    var instanceCounter = 0;
    foreach (var instance in instances)
    {
        var result = parallel ? Classify(instance, maxParalellism) : Classify(instance);
        var prediction = result.Value;
        if (MLArtifact.Evaluate(instance, prediction))
        {
            confusion[$"{prediction}-true"] += 1;
        }
        else
        {
            confusion[$"{prediction}-false"] += 1;
        }
        ++instanceCounter;
    }
    timer.Stop();
    var elapsedEvaluationMillis = timer.ElapsedMilliseconds;
    var perInstanceTime = elapsedEvaluationMillis * 1.0 / instances.Count;
    s_logger.Info($"Times: Total={elapsedEvaluationMillis}ms, PerInstanceAvg={perInstanceTime}ms");
    var correctlyClassified = 0.0;
    var errors = 0.0;
    foreach (var entry in confusion)
    {
        if (entry.Key.Contains("-true"))
        {
            correctlyClassified += entry.Value;
        }
        else
        {
            errors += entry.Value;
        }
    }
    s_logger.Info($"Overall accuracy: {correctlyClassified / (correctlyClassified + errors)}");
    s_logger.Info("Confusion Matrix:");
    foreach (var entry in confusion)
    {
        s_logger.Info($"{entry.Key}:{entry.Value}");
    }
    // per class stats
    var sharedPrecision = confusion["Shared-true"] / (confusion["Shared-true"] + confusion["Shared-false"]);
    var sharedRecall = confusion["Shared-true"] / (confusion["Shared-true"] + confusion["NonShared-false"]);
    var nonSharedPrecision = confusion["NonShared-true"] / (confusion["NonShared-true"] + confusion["NonShared-false"]);
    var nonSharedRecall = confusion["NonShared-true"] / (confusion["NonShared-true"] + confusion["Shared-false"]);
    s_logger.Info("Per class stats:");
    s_logger.Info($"Shared: precision={sharedPrecision}, recall={sharedRecall}");
    s_logger.Info($"NonShared: precision={nonSharedPrecision}, recall={nonSharedRecall}");
}
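// A small sketch (an assumption for illustration, not part of the original source) that factors
// out the per-class precision/recall computation used above for the two-class case, given the
// "{class}-true"/"{class}-false" confusion counters; the helper name PrecisionAndRecall is
// hypothetical. For class c with opposing class o: "c-true" counts true positives, "c-false"
// counts false positives, and "o-false" counts instances of c predicted as o, i.e. false negatives.
private static (double Precision, double Recall) PrecisionAndRecall(
    IReadOnlyDictionary<string, double> confusion, string cls, string otherClass)
{
    var truePositives = confusion[$"{cls}-true"];
    var falsePositives = confusion[$"{cls}-false"];
    var falseNegatives = confusion[$"{otherClass}-false"];
    return (
        Precision: truePositives / (truePositives + falsePositives),
        Recall: truePositives / (truePositives + falseNegatives));
}
// e.g. PrecisionAndRecall(confusion, "Shared", "NonShared") reproduces sharedPrecision/sharedRecall above.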