// Prepares a workbook for simulation: builds its dependency graph, verifies
// that it can be analyzed, and snapshots the original inputs and the outputs
// they produce.
//
// app:                 the Excel application hosting the workbook
// wbh:                 the workbook to analyze
// pb:                  progress bar; advanced once, after the graph is built
// ignore_parse_errors: passed through to the DAG constructor
//
// Returns a PrepData carrying the graph, the saved inputs and outputs, and the
// terminal input vectors / terminal formula cells.
// Throws ContainsLoopException when the graph reports a loop, NoRangeInputs
// when there are no terminal input vectors, and NoFormulas when there are no
// terminal formula cells.
public static PrepData PrepSimulation(Excel.Application app, Excel.Workbook wbh, ProgBar pb, bool ignore_parse_errors)
{
    // build the dependency graph; refuse to continue if it contains a loop
    var graph = new DAG(wbh, app, ignore_parse_errors);
    if (graph.containsLoop())
    {
        throw new DataDebugMethods.ContainsLoopException();
    }
    pb.IncrementProgress();

    // look up the terminal inputs and terminal formulas exactly once;
    // the boolean indicates whether to use all outputs or not
    var input_vectors = graph.terminalInputVectors();
    var formula_cells = graph.terminalFormulaNodes(true);

    if (input_vectors.Length == 0)
    {
        throw new NoRangeInputs();
    }
    if (formula_cells.Length == 0)
    {
        throw new NoFormulas();
    }

    // remember the spreadsheet's original input values
    CellDict saved_inputs = UserSimulation.Utility.SaveInputs(graph);

    // write the saved inputs back to force a recalculation before reading
    // outputs; otherwise Excel floating-point oddities could make a correct
    // procedure look wrong
    UserSimulation.Utility.InjectValues(app, wbh, saved_inputs);

    // remember the outputs computed from the original inputs
    CellDict saved_outputs = UserSimulation.Utility.SaveOutputs(formula_cells, graph);

    var prep = new PrepData();
    prep.dag = graph;
    prep.original_inputs = saved_inputs;
    prep.correct_outputs = saved_outputs;
    prep.terminal_input_nodes = input_vectors;
    prep.terminal_formula_nodes = formula_cells;
    return prep;
}
// Performs one CheckCell step: returns the address with the most unusual
// score, or null when no candidate remains. As a side effect the caller's
// filtered_high_scores list (passed by ref) is replaced.
//
// run_bootstrap: true when the previous step corrected a cell, in which case
//                the bootstrap analysis is re-run and the cutoff re-applied;
//                false when the previous cell was merely marked as OK, in
//                which case the existing list is reused and only the
//                known-good cells are filtered out of it.
// known_good:    cells that have already been inspected
// The remaining parameters are forwarded to Analysis.DataDebug; 'o' is not
// used by this method.
public static AST.Address CheckCell_Step(UserResults o, double significance, CutoffKind ck, int nboots, DAG dag, Excel.Application app, bool weighted, bool all_outputs, bool run_bootstrap, HashSet<AST.Address> known_good, ref List<KeyValuePair<AST.Address, int>> filtered_high_scores, long max_duration_in_ms, Stopwatch sw, ProgBar pb)
{
    if (!run_bootstrap)
    {
        // nothing was corrected, so the suspect list is still valid;
        // just drop the cells that have since been marked as OK
        filtered_high_scores = filtered_high_scores
            .Where(entry => !known_good.Contains(entry.Key))
            .ToList();
    }
    else
    {
        // a correction changed the spreadsheet: recompute the scores ...
        TreeScore all_scores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

        // ... and apply the cutoff to obtain the new suspect list
        filtered_high_scores = ck.applyCutoff(all_scores, known_good);
    }

    // the head of the list is the most unusual remaining cell
    return filtered_high_scores.Any() ? filtered_high_scores[0].Key : null;
}
// Runs 100 single-error trials against one workbook. Each trial picks a
// random data input cell, perturbs its value with the error generator, and
// hands that single error to RunSimulation (which inserts the error and
// restores the original values).
//
// Throws Exception("Missing address!") when a terminal input cell has no
// saved original value.
public static void RunProportionExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
{
    pb.setMax(5);

    // snapshot the initial state of the spreadsheet
    var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

    var error_generator = new ErrorGenerator();

    // candidate cells for perturbation (data inputs only), kept as an
    // array so that one can be drawn by random index
    AST.Address[] inputs = prepdata.dag.terminalInputCells();

    // sanity check: every candidate must have a saved original value
    if (inputs.Any(candidate => !prepdata.original_inputs.ContainsKey(candidate)))
    {
        throw new Exception("Missing address!");
    }

    for (int trial = 0; trial < 100; trial++)
    {
        // draw a random input cell and look up its original value
        AST.Address chosen = inputs[r.Next(inputs.Length)];
        String original_value = prepdata.original_inputs[chosen];

        // perturb the value and record it as the only error of this trial
        String perturbed_value = error_generator.GenerateErrorString(original_value, c);
        var errors = new CellDict { { chosen, perturbed_value } };

        RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
    }
}
// Runs the currently enabled analyses for one set of pre-generated errors and
// appends one CSV result row per analysis to 'outfile' (a header row is
// written first if the file does not exist yet).
//
// Enabled analyses:
//   1. CheckCell10     - QuantileCutoff(0.10), weighted, all outputs
//   2. NormalAllInputs - NormalCutoff(threshold)
//
// Two further configurations (CheckCell5 with QuantileCutoff(0.05), and
// NormalPerRange with NormalCutoff(threshold)) are disabled; the progress bar
// is still advanced once for each of them, so this method always advances the
// progress bar four times.
public static void RunSimulation(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, PrepData prepdata, CellDict errors)
{
    // a fresh results file starts with the CSV header row
    if (!System.IO.File.Exists(outfile))
    {
        System.IO.File.AppendAllText(outfile, Simulation.HeaderRowForCSV());
    }

    // progress tick for the disabled CheckCell5 configuration
    pb.IncrementProgress();

    // CheckCell: weighted, all outputs, 10% quantile cutoff
    var checkcell_sim = new UserSimulation.Simulation();
    checkcell_sim.RunFromBatch(
        nboots,                                   // number of bootstraps
        wbh.FullName,                             // Excel filename
        significance,                             // statistical significance of threshold
        app,                                      // Excel.Application
        new QuantileCutoff(0.10),                 // max % extreme values to flag
        c,                                        // classification data
        r,                                        // random number generator
        UserSimulation.AnalysisType.CheckCell10,  // analysis type
        true,                                     // weighted analysis
        true,                                     // use all outputs for analysis
        prepdata.dag,                             // dependency graph
        wbh,                                      // Excel.Workbook
        errors,                                   // pre-generated errors
        prepdata.terminal_input_nodes,            // input range nodes
        prepdata.terminal_formula_nodes,          // output nodes
        prepdata.original_inputs,                 // original input values
        prepdata.correct_outputs,                 // original output values
        max_duration_in_ms,                       // max duration of simulation
        logfile);
    System.IO.File.AppendAllText(outfile, checkcell_sim.FormatResultsAsCSV());
    pb.IncrementProgress();

    // Normal analysis over all inputs; the original notes mark nboots and the
    // two boolean flags as irrelevant for this analysis type
    var normal_sim = new UserSimulation.Simulation();
    normal_sim.RunFromBatch(
        nboots,                                       // irrelevant here
        wbh.FullName,                                 // Excel filename
        significance,                                 // normal cutoff? (unclear in original)
        app,                                          // Excel.Application
        new NormalCutoff(threshold),                  // cutoff built from 'threshold'
        c,                                            // classification data
        r,                                            // random number generator
        UserSimulation.AnalysisType.NormalAllInputs,  // analysis type
        true,                                         // irrelevant here
        true,                                         // irrelevant here
        prepdata.dag,                                 // dependency graph
        wbh,                                          // Excel.Workbook
        errors,                                       // pre-generated errors
        prepdata.terminal_input_nodes,                // input range nodes
        prepdata.terminal_formula_nodes,              // output nodes
        prepdata.original_inputs,                     // original input values
        prepdata.correct_outputs,                     // original output values
        max_duration_in_ms,                           // max duration of simulation
        logfile);
    System.IO.File.AppendAllText(outfile, normal_sim.FormatResultsAsCSV());
    pb.IncrementProgress();

    // progress tick for the disabled NormalPerRange configuration
    pb.IncrementProgress();
}
// Runs 100 "subtle error" trials against one workbook. Each trial picks a
// random *numeric* data input cell, perturbs its value with
// GenerateSubtleErrorString, and hands that single error to RunSimulation
// (which inserts the error and restores the original values).
//
// Returns true when all 100 trials ran; returns false as soon as a trial
// cannot find any input whose saved value parses as a double. Trials that ran
// before that point are not undone.
public static bool RunSubletyExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
{
    pb.setMax(5);

    // record initial state of spreadsheet
    var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

    // init error generator
    var eg = new ErrorGenerator();

    // get inputs as an array of addresses to facilitate random selection
    // DATA INPUTS ONLY
    AST.Address[] inputs = prepdata.dag.terminalInputCells();

    for (int i = 0; i < 100; i++)
    {
        // Randomly choose a numeric input: walk a shuffled copy of the
        // inputs once and stop at the first value that parses as a double.
        // (The list is scanned in place rather than re-copied after every
        // rejected candidate, which kept the same choice but cost O(n^2).)
        AST.Address rand_addr = null;
        double input_value = 0.0;
        bool num_found = false;

        foreach (var candidate in inputs.Shuffle().ToList())
        {
            if (Double.TryParse(prepdata.original_inputs[candidate], out input_value))
            {
                rand_addr = candidate;
                num_found = true;
                break;
            }
        }

        // no candidates left: the workbook has no numeric inputs
        if (!num_found)
        {
            return false;
        }

        // perturb it
        String erroneous_input = eg.GenerateSubtleErrorString(input_value, c);

        // create an error dictionary with this one perturbed value
        var errors = new CellDict();
        errors.Add(rand_addr, erroneous_input);

        // run simulations; simulation code does insertion of errors and restore of originals
        RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
    }
    return true;
}
// Runs the main paper simulation for one workbook: snapshots the workbook,
// asks GenImportantErrors for a set of errors, and passes them to
// RunSimulation.
public static void RunSimulationPaperMain(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
{
    pb.setMax(5);

    // snapshot the initial state of the spreadsheet
    var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

    // generate the errors to inject
    // NOTE(review): the literal 5 is presumably the number of errors to
    // generate -- confirm against GenImportantErrors
    CellDict generated_errors = UserSimulation.Utility.GenImportantErrors(
        prepdata.terminal_formula_nodes,
        prepdata.original_inputs,
        5,
        prepdata.correct_outputs,
        app,
        wbh,
        c,
        prepdata.dag);

    // run the paper simulations with those errors
    RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, generated_errors);
}
// Scores the workbook's inputs: propagates weights through the graph, draws
// bootstrap resamples for every terminal input vector, and delegates to
// Inference for the resulting scores.
//
// num_bootstraps: the number of bootstrap samples to draw per input vector
// dag:            the dependency graph; its node weights are modified
// weighted:       forwarded to Inference
// all_outputs:    forwarded to dag.terminalFormulaNodes
// significance:   forwarded to Inference
// pb:             progress bar; its maximum is set to twice the number of
//                 input vectors and it is advanced once per resampled vector
// app, max_duration_in_ms and sw are accepted but not used by this method.
public static TreeScore DataDebug(int num_bootstraps, DAG dag, Excel.Application app, bool weighted, bool all_outputs, long max_duration_in_ms, Stopwatch sw, double significance, ProgBar pb)
{
    // this modifies the weights of each node
    PropagateWeights(dag);

    // terminal formulas (outputs) and terminal input vectors only
    var formula_cells = dag.terminalFormulaNodes(all_outputs);
    var input_vectors = dag.terminalInputVectors();

    // bootstraps[v][b] is the b-th resample of input vector v
    var bootstraps = new InputSample[input_vectors.Length][];

    // RNG used for sampling
    var sampler = new Random();

    // snapshot of the inputs and outputs before anything is perturbed
    var saved_inputs = StoreInputs(input_vectors, dag);
    var saved_outputs = StoreOutputs(formula_cells, dag);

    pb.setMax(input_vectors.Length * 2);

    #region RESAMPLE
    // draw the bootstrap resamples for each input vector in turn
    for (int v = 0; v < input_vectors.Length; v++)
    {
        bootstraps[v] = Resample(num_bootstraps, saved_inputs[input_vectors[v]], sampler);
        pb.IncrementProgress();
    }
    #endregion RESAMPLE

    #region INFERENCE
    return Inference(
        num_bootstraps,
        bootstraps,
        saved_inputs,
        saved_outputs,
        input_vectors,
        formula_cells,
        dag,
        weighted,
        significance,
        pb);
    #endregion INFERENCE
}
// For every input vector: replays its bootstrap resamples in Excel, collects
// the resulting function outputs, and queues a hypothesis-test job on the
// thread pool. Once every job has finished, the per-job results are merged
// into a single TreeScore, which is returned.
//
// num_bootstraps:  number of resamples per input vector
// resamples:       resamples[i][b] is the b-th resample of input_arr[i]
// initial_inputs:  original contents of each input vector
// initial_outputs: original value of each output
// input_arr:       the input vectors to test
// output_arr:      the outputs whose values are recorded
// pb:              progress bar; advanced once per queued job
//
// OutOfMemoryException handling: when preparing input i runs out of memory,
// the method waits for one still-running job to finish (or, if none is
// running, forces a garbage collection once) and then retries the SAME
// input. A second failure after the forced collection is rethrown.
public static TreeScore Inference(
    int num_bootstraps,
    InputSample[][] resamples,
    Dictionary<AST.Range, InputSample> initial_inputs,
    Dictionary<AST.Address, string> initial_outputs,
    AST.Range[] input_arr,
    AST.Address[] output_arr,
    DAG dag,
    bool weighted,
    double significance,
    ProgBar pb)
{
    // init thread event notification array
    var mres = new ManualResetEvent[input_arr.Length];

    // init job storage
    var ddjs = new DataDebugJob[input_arr.Length];

    // last-ditch effort flag: set once a forced GC has been attempted
    bool last_try = false;

    // init score storage
    var scores = new TreeScore();

    // i only advances once job i has been queued, so that a failed
    // iteration is retried instead of being skipped
    int i = 0;
    while (i < input_arr.Length)
    {
        try
        {
            #region BOOTSTRAP
            // bootstrapping is done in the parent STA thread because
            // the .NET threading model prohibits thread pools (which
            // are MTA) from accessing STA COM objects directly.

            // alloc bootstrap storage for each output (f), for each resample (b)
            FunctionOutput<string>[][] bs = new FunctionOutput<string>[initial_outputs.Count][];
            for (int f = 0; f < initial_outputs.Count; f++)
            {
                bs[f] = new FunctionOutput<string>[num_bootstraps];
            }

            // init memoization table for input vector i
            var memo = new BootMemo();

            // fetch the input range
            var input = input_arr[i];

            // fetch the input range COM object
            var com = dag.getCOMRefForRange(input).Range;

            // compute outputs: for each resample, look the outputs up in the
            // memo table; otherwise do the replacement, compute the outputs,
            // store them in the table, and return them
            for (var b = 0; b < num_bootstraps; b++)
            {
                FunctionOutput<string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                for (var f = 0; f < output_arr.Length; f++)
                {
                    bs[f][b] = fos[f];
                }
            }

            // restore the original inputs; faster to do once, after bootstrapping is done
            BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

            // TODO: restore formulas if it turns out that they were overwritten
            //       this should never be the case
            #endregion BOOTSTRAP

            #region HYPOTHESIS_TEST
            // completion event for job i
            mres[i] = new ManualResetEvent(false);

            // set up job
            ddjs[i] = new DataDebugJob(
                    dag,
                    bs,
                    initial_outputs,
                    input_arr[i],
                    output_arr,
                    weighted,
                    significance,
                    mres[i]
                    );

            // hand job to thread pool
            ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
            #endregion HYPOTHESIS_TEST

            // job i is now owned by the thread pool; move on to the next input
            i++;

            // update progress bar
            pb.IncrementProgress();
        }
        catch (System.OutOfMemoryException)
        {
            if (last_try)
            {
                // we already forced a GC and still cannot allocate:
                // we just don't have enough memory
                throw;
            }

            // Jobs 0..i-1 have all been queued, so their events are non-null.
            // Slots from i onwards may still be null and must not be touched.
            var pending = mres.Take(i).Where(mre => !mre.WaitOne(0)).ToArray();

            if (pending.Length == 0)
            {
                // no job is running that could free memory by finishing;
                // force a collection and allow exactly one more attempt
                GC.Collect();
                last_try = true;
            }
            else
            {
                // wait for a running job to finish, then retry input i;
                // only unfinished events are waited on because finished
                // ones stay signaled and would return immediately
                pending[0].WaitOne();
            }
        }
    }

    // Do not proceed until all hypothesis tests are done.
    // WaitHandle.WaitAll cannot be called on an STA thread which
    // is why we call WaitOne in a loop.
    // Merge scores as data becomes available.
    for (int j = 0; j < input_arr.Length; j++)
    {
        mres[j].WaitOne();
        scores = DictAdd(scores, ddjs[j].Result);
    }

    return scores;
}
// For every input vector: replays its bootstrap resamples in Excel, collects
// the resulting function outputs, and queues a hypothesis-test job on the
// thread pool. Once every job has finished, the per-job results are merged
// into a single TreeScore, which is returned.
//
// num_bootstraps:  number of resamples per input vector
// resamples:       resamples[i][b] is the b-th resample of input_arr[i]
// initial_inputs:  original contents of each input vector
// initial_outputs: original value of each output
// input_arr:       the input vectors to test
// output_arr:      the outputs whose values are recorded
// pb:              progress bar; advanced once per queued job
//
// OutOfMemoryException handling: when preparing input i runs out of memory,
// the method waits for one still-running job to finish (or, if none is
// running, forces a garbage collection once) and then retries the SAME
// input. A second failure after the forced collection is rethrown.
public static TreeScore Inference(
    int num_bootstraps,
    InputSample[][] resamples,
    Dictionary<AST.Range, InputSample> initial_inputs,
    Dictionary<AST.Address, string> initial_outputs,
    AST.Range[] input_arr,
    AST.Address[] output_arr,
    DAG dag,
    bool weighted,
    double significance,
    ProgBar pb)
{
    // one completion event and one job per input vector
    var mres = new ManualResetEvent[input_arr.Length];
    var ddjs = new DataDebugJob[input_arr.Length];

    // last-ditch effort flag: set once a forced GC has been attempted
    bool last_try = false;

    // init score storage
    var scores = new TreeScore();

    // i only advances once job i has been queued, so that a failed
    // iteration is retried instead of being skipped
    int i = 0;
    while (i < input_arr.Length)
    {
        try
        {
            #region BOOTSTRAP
            // bootstrapping is done in the parent STA thread because
            // the .NET threading model prohibits thread pools (which
            // are MTA) from accessing STA COM objects directly.

            // alloc bootstrap storage for each output (f), for each resample (b)
            FunctionOutput<string>[][] bs = new FunctionOutput<string>[initial_outputs.Count][];
            for (int f = 0; f < initial_outputs.Count; f++)
            {
                bs[f] = new FunctionOutput<string>[num_bootstraps];
            }

            // init memoization table for input vector i
            var memo = new BootMemo();

            // fetch the input range
            var input = input_arr[i];

            // fetch the input range COM object
            var com = dag.getCOMRefForRange(input).Range;

            // compute outputs: for each resample, look the outputs up in the
            // memo table; otherwise do the replacement, compute the outputs,
            // store them in the table, and return them
            for (var b = 0; b < num_bootstraps; b++)
            {
                FunctionOutput<string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                for (var f = 0; f < output_arr.Length; f++)
                {
                    bs[f][b] = fos[f];
                }
            }

            // restore the original inputs; faster to do once, after bootstrapping is done
            BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

            // TODO: restore formulas if it turns out that they were overwritten
            //       this should never be the case
            #endregion BOOTSTRAP

            #region HYPOTHESIS_TEST
            // completion event for job i
            mres[i] = new ManualResetEvent(false);

            // set up job
            ddjs[i] = new DataDebugJob(
                    dag,
                    bs,
                    initial_outputs,
                    input_arr[i],
                    output_arr,
                    weighted,
                    significance,
                    mres[i]
                    );

            // hand job to thread pool
            ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
            #endregion HYPOTHESIS_TEST

            // job i is now owned by the thread pool; move on to the next input
            i++;

            // update progress bar
            pb.IncrementProgress();
        }
        catch (System.OutOfMemoryException)
        {
            if (last_try)
            {
                // we already forced a GC and still cannot allocate:
                // we just don't have enough memory
                throw;
            }

            // Jobs 0..i-1 have all been queued, so their events are non-null.
            // Slots from i onwards may still be null and must not be touched.
            var pending = mres.Take(i).Where(mre => !mre.WaitOne(0)).ToArray();

            if (pending.Length == 0)
            {
                // no job is running that could free memory by finishing;
                // force a collection and allow exactly one more attempt
                GC.Collect();
                last_try = true;
            }
            else
            {
                // wait for a running job to finish, then retry input i;
                // only unfinished events are waited on because finished
                // ones stay signaled and would return immediately
                pending[0].WaitOne();
            }
        }
    }

    // Do not proceed until all hypothesis tests are done.
    // WaitHandle.WaitAll cannot be called on an STA thread which
    // is why we call WaitOne in a loop.
    // Merge scores as data becomes available.
    for (int j = 0; j < input_arr.Length; j++)
    {
        mres[j].WaitOne();
        scores = DictAdd(scores, ddjs[j].Result);
    }

    return scores;
}
// Entry point of the bootstrap analysis. Propagates weights through the
// graph, resamples every terminal input vector num_bootstraps times, and
// returns the scores computed by Inference.
//
// num_bootstraps: the number of bootstrap samples to draw per input vector
// dag:            the dependency graph; PropagateWeights modifies its weights
// all_outputs:    forwarded to dag.terminalFormulaNodes
// weighted, significance: forwarded to Inference
// pb:             progress bar; its maximum is set to twice the number of
//                 input vectors and it is advanced once per resampled vector
// app, max_duration_in_ms and sw are accepted but not used by this method.
public static TreeScore DataDebug(int num_bootstraps, DAG dag, Excel.Application app, bool weighted, bool all_outputs, long max_duration_in_ms, Stopwatch sw, double significance, ProgBar pb)
{
    // this modifies the weights of each node
    PropagateWeights(dag);

    // keep only terminal functions and terminal input vectors
    var outputs = dag.terminalFormulaNodes(all_outputs);
    var vectors = dag.terminalInputVectors();

    // first index: position of the input vector in 'vectors'
    // second index: the bootstrap number
    var samples = new InputSample[vectors.Length][];

    // RNG for sampling
    var random = new Random();

    // the values present before any resample is written to the workbook
    var original_inputs = StoreInputs(vectors, dag);
    var original_outputs = StoreOutputs(outputs, dag);

    // Set progress bar max
    pb.setMax(vectors.Length * 2);

    #region RESAMPLE
    // populate the bootstrap array, one input vector at a time
    int index = 0;
    while (index < vectors.Length)
    {
        var vector = vectors[index];
        samples[index] = Resample(num_bootstraps, original_inputs[vector], random);
        pb.IncrementProgress();
        index++;
    }
    #endregion RESAMPLE

    #region INFERENCE
    return Inference(
        num_bootstraps,
        samples,
        original_inputs,
        original_outputs,
        vectors,
        outputs,
        dag,
        weighted,
        significance,
        pb);
    #endregion INFERENCE
}
// Analyzes the active workbook and stores the cells worth flagging in
// _flaggable.
//
// Builds the dependency graph, runs the bootstrap analysis, sorts the scores
// in descending order, and keeps the cells whose score is at or above the
// cutoff, is non-zero, and is not already known to be good. When the workbook
// has no terminal input vectors a message box is shown and _flaggable is set
// to an empty array.
//
// Screen updating is disabled for the duration of the analysis and is always
// re-enabled on the way out, including when an exception (for example an
// ExcelParserUtility.ParseException from graph construction) propagates to
// the caller.
//
// max_duration_in_ms: forwarded to Analysis.DataDebug
public void Analyze(long max_duration_in_ms)
{
    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    using (var pb = new ProgBar())
    {
        // Disable screen updating during analysis to speed things up
        _app.ScreenUpdating = false;

        try
        {
            // Build dependency graph (modifies data)
            _dag = new DAG(_app.ActiveWorkbook, _app, IGNORE_PARSE_ERRORS);

            // result is unused; the call is retained in case it has side
            // effects -- TODO confirm and remove if it is a pure getter
            _dag.numberOfInputCells();

            if (_dag.terminalInputVectors().Length == 0)
            {
                System.Windows.Forms.MessageBox.Show("This spreadsheet contains no vector-input functions.");
                _flaggable = new KeyValuePair<AST.Address, int>[0];
                return;
            }

            // Get bootstraps, most unusual score first
            var scores = Analysis.DataDebug(NBOOTS,
                                            _dag,
                                            _app,
                                            weighted: USE_WEIGHTS,
                                            all_outputs: CONSIDER_ALL_OUTPUTS,
                                            max_duration_in_ms: max_duration_in_ms,
                                            sw: sw,
                                            significance: _tool_significance,
                                            pb: pb)
                                 .OrderByDescending(pair => pair.Value).ToArray();

            if (_debug_mode)
            {
                var score_str = String.Join("\n", scores.Take(10).Select(score => score.Key.A1FullyQualified() + " -> " + score.Value.ToString()));
                System.Windows.Forms.MessageBox.Show(score_str);
                System.Windows.Forms.Clipboard.SetText(score_str);
            }

            // nothing was scored, so nothing can be flagged
            if (scores.Length == 0)
            {
                _flaggable = new KeyValuePair<AST.Address, int>[0];
                return;
            }

            // calculate cutoff index; clamp it into the array because
            // rounding can produce scores.Length (or a negative index)
            int thresh = scores.Length - Convert.ToInt32(scores.Length * _tool_significance);
            thresh = Math.Max(0, Math.Min(thresh, scores.Length - 1));
            var cutoff_score = scores[thresh].Value;

            // filter out cells that are...
            _flaggable = scores.Where(pair => pair.Value >= cutoff_score)      // below threshold
                               .Where(pair => !_known_good.Contains(pair.Key)) // known to be good
                               .Where(pair => pair.Value != 0).ToArray();      // score == 0
        }
        finally
        {
            // Enable screen updating when we're done, however we leave
            _app.ScreenUpdating = true;
            sw.Stop();
        }
    }
}
// Runs the subtlety experiment for one workbook and reports (via a message
// box) when the workbook has no numeric inputs.
//
// Results are appended to "simulation_results.csv" in output_dir; the
// per-workbook iteration log is written to "<workbook name without
// extension>.iterlog.csv" in the same directory.
//
// Screen updating is disabled while the experiment runs and is always
// re-enabled afterwards, even if the experiment throws.
private static void RunSubletyExperiment(Excel.Application app, Excel.Workbook wb, Random rng, UserSimulation.Classification c, string output_dir, double thresh, ProgBar pb)
{
    // number of bootstraps
    var NBOOTS = 2700;

    // the name of the active workbook (Workbook.Name, not its full path)
    var filename = app.ActiveWorkbook.Name;

    // the default output filename
    var default_output_file = "simulation_results.csv";

    // the log file is named after the workbook, minus its extension; this
    // also yields a usable name when the workbook name has no .xls* suffix
    var default_log_file = System.IO.Path.GetFileNameWithoutExtension(filename) + ".iterlog.csv";

    // save file location (will append for additional runs)
    var savefile = System.IO.Path.Combine(output_dir, default_output_file);

    // log file location (new file for each new workbook)
    var logfile = System.IO.Path.Combine(output_dir, default_log_file);

    // disable screen updating
    app.ScreenUpdating = false;

    try
    {
        // run simulations
        if (!UserSimulation.Config.RunSubletyExperiment(app, wb, NBOOTS, 0.95, thresh, c, rng, savefile, MAX_DURATION_IN_MS, logfile, pb, IGNORE_PARSE_ERRORS))
        {
            System.Windows.Forms.MessageBox.Show("This spreadsheet contains no numeric inputs.");
        }
    }
    finally
    {
        // enable screen updating
        app.ScreenUpdating = true;
    }
}
// Simulates a user who inspects flagged cells one at a time until the chosen
// analysis stops flagging anything (or a count-based cutoff is reached).
//
// Each flagged cell is checked against 'errord' (the injected errors):
//   - if it is an injected error, the original value is written back to the
//     cell and the cell is recorded as a true positive;
//   - otherwise it is recorded as a false positive.
// Either way the cell is added to the known-good set, and a LogEntry is both
// written to 'logfile' and appended to _error_log.
//
// After the loop, injected errors that were never flagged are recorded as
// false negatives; their LogEntry objects are appended to _error_log.
//
// original_inputs: the values the cells held before errors were injected
// errord:          the injected errors, keyed by cell address
// correct_outputs: the outputs computed from the original inputs
// analysis_type:   selects which SimulationStep method picks the next cell
//
// Returns the UserResults accumulated during the run. MODIFIES THE WORKBOOK
// (flagged error cells are overwritten with their original values).
private UserResults SimulateUser(int nboots,
                                 double significance,
                                 CutoffKind ck,
                                 DAG dag,
                                 CellDict original_inputs,
                                 CellDict errord,
                                 CellDict correct_outputs,
                                 Excel.Workbook wb,
                                 Excel.Application app,
                                 AnalysisType analysis_type,
                                 bool weighted,
                                 bool all_outputs,
                                 long max_duration_in_ms,
                                 Stopwatch sw,
                                 String logfile,
                                 ProgBar pb
                                )
{
    // init user results data structure
    var o = new UserResults();
    HashSet<AST.Address> known_good = new HashSet<AST.Address>();

    // initialize procedure
    var errors_remain = true;
    var max_errors = new ErrorDict();
    // outputs as they stand on entry, i.e. with the errors already injected
    var incorrect_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
    var errors_found = 0;
    // not read anywhere else in this method
    var number_of_true_errors = errord.Count;
    Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

    // the corrected state of the spreadsheet; starts as a copy of the correct outputs
    CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

    // remove errors loop
    var cells_inspected = 0;
    List<KeyValuePair<AST.Address, int>> filtered_high_scores = null;
    // starts out true so that the first CheckCell step runs the bootstrap
    bool correction_made = true;
    while (errors_remain)
    {
        // console progress indicator, one dot per iteration
        Console.Write(".");

        AST.Address flagged_cell = null;

        // choose the appropriate test; any other analysis type leaves
        // flagged_cell null, which ends the loop below
        if (analysis_type == AnalysisType.CheckCell5 ||
            analysis_type == AnalysisType.CheckCell10
            )
        {
            flagged_cell = SimulationStep.CheckCell_Step(o,
                                                         significance,
                                                         ck,
                                                         nboots,
                                                         dag,
                                                         app,
                                                         weighted,
                                                         all_outputs,
                                                         correction_made,
                                                         known_good,
                                                         ref filtered_high_scores,
                                                         max_duration_in_ms,
                                                         sw,
                                                         pb);
        }
        else if (analysis_type == AnalysisType.NormalPerRange)
        {
            flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
        }
        else if (analysis_type == AnalysisType.NormalAllInputs)
        {
            flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
        }

        // stop if the test no longer returns anything or if
        // the test is simply done inspecting based on a fixed threshold
        if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
        {
            errors_remain = false;
        }
        else    // a cell was flagged
        {
            // cells_inspected should only be incremented when a cell is actually flagged. If nothing is flagged,
            // then nothing is inspected, so cells_inspected doesn't increase.
            cells_inspected += 1;

            // check to see if the flagged value is actually an error
            if (errord.ContainsKey(flagged_cell))
            {
                correction_made = true;
                errors_found += 1;
                // P(k) * rel(k)
                o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                o.true_positives.Add(flagged_cell);

                // correct flagged cell by writing its original value back into Excel
                flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                // NOTE(review): at this point partially_corrected_outputs still holds
                // the outputs saved BEFORE this correction (it is refreshed a few
                // lines below), so the max-error update and the total error here
                // appear to lag one correction behind -- confirm this is intended
                Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                // compute total error after applying this correction
                var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                o.current_total_error.Add(current_total_error);

                // save outputs as they stand after the correction
                partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            }
            else
            {
                correction_made = false;
                // numerator is 0 here because rel(k) = 0 when no error was found
                o.PrecRel_at_k.Add(0.0);
                o.false_positives.Add(flagged_cell);
            }

            // mark it as known good -- at this point the cell has been
            // 'inspected' regardless of whether it was an error
            // It was either corrected or marked as OK
            known_good.Add(flagged_cell);

            // compute output error magnitudes
            var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // compute input error magnitude: numeric when both the erroneous and
            // the original value are numbers, string-based otherwise, and zero
            // for both when the flagged cell was not an injected error
            double num_input_error_magnitude;
            double str_input_error_magnitude;
            if (errord.ContainsKey(flagged_cell))
            {
                if (Utility.BothNumbers(errord[flagged_cell], original_inputs[flagged_cell]))
                {
                    num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[flagged_cell]), Double.Parse(original_inputs[flagged_cell]));
                    str_input_error_magnitude = 0;
                }
                else
                {
                    num_input_error_magnitude = 0;
                    str_input_error_magnitude = Utility.StringMagnitudeChange(errord[flagged_cell], original_inputs[flagged_cell]);
                }
            }
            else
            {
                num_input_error_magnitude = 0;
                str_input_error_magnitude = 0;
            }

            // write error log; the logged "erroneous" value falls back to the
            // original value when the flagged cell was not an injected error
            // NOTE(review): the literal 'true' is presumably a "was flagged"
            // marker (the false-negative entries below pass 'false') -- confirm
            // against the LogEntry constructor
            var logentry = new LogEntry(analysis_type,
                                        wb.Name,
                                        flagged_cell,
                                        original_inputs[flagged_cell],
                                        errord.ContainsKey(flagged_cell) ? errord[flagged_cell] : original_inputs[flagged_cell],
                                        output_error_magnitude,
                                        num_input_error_magnitude,
                                        str_input_error_magnitude,
                                        true,
                                        correction_made,
                                        significance,
                                        ck.Threshold);
            logentry.WriteLog(logfile);
            _error_log.Add(logentry);
        }
    }

    // find all of the false negatives
    o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
    o.max_errors = max_errors;

    // output error magnitude at the end of the run; shared by every false-negative entry
    var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

    // write out all false negative information
    foreach (AST.Address fn in o.false_negatives)
    {
        double num_input_error_magnitude;
        double str_input_error_magnitude;
        if (Utility.BothNumbers(errord[fn], original_inputs[fn]))
        {
            num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[fn]), Double.Parse(original_inputs[fn]));
            str_input_error_magnitude = 0;
        }
        else
        {
            num_input_error_magnitude = 0;
            str_input_error_magnitude = Utility.StringMagnitudeChange(errord[fn], original_inputs[fn]);
        }

        // record the entry in the in-memory error log
        // NOTE(review): unlike flagged cells above, these entries are not passed
        // to WriteLog(logfile) -- confirm whether that is intentional
        _error_log.Add(new LogEntry(analysis_type,
                                    wb.Name,
                                    fn,
                                    original_inputs[fn],
                                    errord[fn],
                                    last_out_err_mag,
                                    num_input_error_magnitude,
                                    str_input_error_magnitude,
                                    false,
                                    true,
                                    significance,
                                    ck.Threshold));
    }
    return o;
}
// Runs one complete simulation: injects the errors held in _errors, lets the
// simulated user remove them, computes the error / effort / precision
// statistics into this object's fields, and finally restores the original
// input values.
//
// Returns _effort, i.e. the number of true positives plus false positives
// inspected by the simulated user.
public int Run(int nboots,                      // number of bootstraps
               string xlfile,                   // name of the workbook
               double significance,             // significance threshold for test
               CutoffKind ck,                   // kind of threshold function to use
               Excel.Application app,           // reference to Excel app
               Classification c,                // data from which to generate errors
               Random r,                        // a random number generator
               AnalysisType analysisType,       // the type of analysis to run
               bool weighted,                   // should we weigh things?
               bool all_outputs,                // if !all_outputs, we only consider terminal outputs
               DAG dag,
               Excel.Workbook wb,
               AST.Address[] terminal_formula_cells,
               AST.Range[] terminal_input_vectors,
               CellDict original_inputs,
               CellDict correct_outputs,
               long max_duration_in_ms,
               String logfile,                  // filename for the output log
               ProgBar pb
               )
{
    // remember which workbook and which configuration this run describes
    _wb_name = xlfile;
    _wb_path = wb.Path;
    _analysis_type = analysisType;
    _significance = significance;
    _all_outputs = all_outputs;
    _weighted = weighted;

    // put the errors from _errors into the workbook
    Utility.InjectValues(app, wb, _errors);

    // the outputs produced while the errors are in place
    CellDict outputs_with_errors = Utility.SaveOutputs(terminal_formula_cells, dag);

    // time the removal of errors
    Stopwatch timer = new Stopwatch();
    timer.Start();

    // remove errors until none remain; MODIFIES WORKBOOK
    _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, timer, logfile, pb);

    timer.Stop();
    _analysis_time = timer.Elapsed.TotalSeconds;

    // the outputs left behind once the simulated user is done
    var outputs_after_user = Utility.SaveOutputs(terminal_formula_cells, dag);

    // remaining total relative error
    _error = Utility.CalculateNormalizedError(correct_outputs, outputs_after_user, _user.max_errors);
    _total_relative_error = Utility.TotalRelativeError(_error);

    // starting total relative error (normalized by max_errors)
    ErrorDict initial_error = Utility.CalculateNormalizedError(correct_outputs, outputs_with_errors, _user.max_errors);
    _initial_total_relative_error = Utility.TotalRelativeError(initial_error);

    // effort: cells inspected, as a count and as a fraction of all cells
    _max_effort = dag.allCells().Length;
    _effort = (_user.true_positives.Count + _user.false_positives.Count);
    _expended_effort = (double)_effort / (double)_max_effort;

    // average precision:
    //   AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
    // where P(k) is the precision at threshold k and
    // rel(k) is 1 if the item at k is a true positive, 0 otherwise
    _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

    // put the original values back
    Utility.InjectValues(app, wb, original_inputs);

    _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

    // flag that we're done; safe to print output results
    _simulation_run = true;

    // the number of cells inspected
    return _effort;
}