// Replaces the input range's values with the given sample, reads every
// output cell, and memoizes the result so an identical sample is never
// recomputed. When replace_original is set, the original input values
// are written back afterwards.
public FunctionOutput <string>[] FastReplace(Excel.Range com, DAG dag, InputSample original, InputSample sample, AST.Address[] outputs, bool replace_original)
{
    FunctionOutput <string>[] cached;
    if (_d.TryGetValue(sample, out cached))
    {
        // cache hit: the spreadsheet is left untouched
        return cached;
    }

    // cache miss: write the sample into the spreadsheet
    ReplaceExcelRange(com, sample);

    // read back the value of every output cell
    cached = new FunctionOutput <string> [outputs.Length];
    for (var idx = 0; idx < outputs.Length; idx++)
    {
        cached[idx] = new FunctionOutput <string>(dag.readCOMValueAtAddress(outputs[idx]), sample.GetExcludes());
    }

    // memoize for next time; the Add return value is irrelevant
    _d.Add(sample, cached);

    // optionally restore the original input values
    if (replace_original)
    {
        ReplaceExcelRange(com, original);
    }

    return cached;
}
// Recursively computes and stores a node's weight: a plain input cell
// weighs 1; a formula weighs the sum of all of its inputs' weights.
private static int PropagateNodeWeight(AST.Address node, DAG dag)
{
    if (!dag.isFormula(node))
    {
        // base case: node is an input cell
        dag.setWeight(node, 1);
        return 1;
    }

    // gather every input address: cells of each input vector range,
    // followed by the single-cell inputs
    var all_inputs = dag.getFormulaInputVectors(node)
                        .SelectMany(vrng => vrng.Addresses())
                        .Concat(dag.getFormulaSingleCellInputs(node));

    // a formula's weight is the sum of its inputs' weights
    var total = all_inputs.Sum(input => PropagateNodeWeight(input, dag));
    dag.setWeight(node, total);
    return total;
}
// Replaces the input range's values with the given sample and collects
// every output cell's value, consulting a memo table first so that the
// expensive COM round-trip only happens once per distinct sample.
public FunctionOutput<string>[] FastReplace(Excel.Range com, DAG dag, InputSample original, InputSample sample, AST.Address[] outputs, bool replace_original)
{
    FunctionOutput<string>[] results;
    if (!_d.TryGetValue(sample, out results))
    {
        // not memoized: push the sample into the spreadsheet
        ReplaceExcelRange(com, sample);

        // collect the value of each output cell
        results = new FunctionOutput<string>[outputs.Length];
        var i = 0;
        foreach (var output in outputs)
        {
            results[i++] = new FunctionOutput<string>(dag.readCOMValueAtAddress(output), sample.GetExcludes());
        }

        // remember the outputs for this sample; return value of Add unused
        _d.Add(sample, results);

        // put the original values back if asked to
        if (replace_original)
        {
            ReplaceExcelRange(com, original);
        }
    }
    return results;
}
// Numeric hypothesis test: for each input cell of rangeNode, tests whether
// excluding that input makes the original function output fall outside the
// bootstrapped confidence bounds, and accumulates a (possibly weighted)
// exclusion score per input cell.
//
// Returns: TreeScore mapping each input cell to its exclusion score
//          (every input cell gets an entry, zero when the test passes).
public static TreeScore NumericHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput <string>[] boots, string initial_output, bool weighted, double significance)
{
    // this function's input cells
    var input_cells = rangeNode.Addresses();
    var inputs_sz = input_cells.Count();

    // scores
    var input_exclusion_scores = new TreeScore();

    // convert to numeric
    var numeric_boots = ConvertToNumericOutput(boots);

    // sort
    var sorted_num_boots = SortBootstraps(numeric_boots);

    // the weight of the function value of interest does not depend on the
    // excluded index, so look it up once instead of on every iteration
    int weight = weighted ? dag.getWeight(functionNode) : 1;

    // for each excluded index, test whether the original input
    // falls outside our bootstrap confidence bounds
    for (int i = 0; i < inputs_sz; i++)
    {
        AST.Address xtree = input_cells[i];

        double outlieriness = RejectNullHypothesis(sorted_num_boots, initial_output, i, significance);

        if (outlieriness != 0.0)
        {
            // add weighted outlieriness to this input's running score
            if (input_exclusion_scores.ContainsKey(xtree))
            {
                input_exclusion_scores[xtree] += (int)(weight * outlieriness);
            }
            else
            {
                input_exclusion_scores.Add(xtree, (int)(weight * outlieriness));
            }
        }
        else
        {
            // we need to at least add the value to the tree
            if (!input_exclusion_scores.ContainsKey(xtree))
            {
                input_exclusion_scores.Add(xtree, 0);
            }
        }
    }
    return input_exclusion_scores;
}
// Propagate weights.
// Assigns a weight to every node reachable from a terminal formula by
// recursively summing input weights; refuses to run on a cyclic graph.
private static void PropagateWeights(DAG dag)
{
    if (dag.containsLoop())
    {
        throw new ContainsLoopException();
    }

    // each terminal formula is the root of one tree in the forest;
    // weigh each tree recursively
    foreach (AST.Address root in dag.terminalFormulaNodes(false))
    {
        dag.setWeight(root, PropagateNodeWeight(root, dag));
    }
}
// String hypothesis test: for each input cell of rangeNode, tests whether
// excluding that input changes the function output from initial_output,
// and accumulates a (possibly weighted) exclusion score per input cell.
//
// Returns: TreeScore mapping each input cell to its exclusion score
//          (every input cell gets an entry, zero when the test passes).
public static TreeScore StringHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput <string>[] boots, string initial_output, bool weighted, double significance)
{
    // this function's input cells
    var input_cells = rangeNode.Addresses();

    // scores
    var iexc_scores = new TreeScore();

    var inputs_sz = input_cells.Count();

    // the weight of the function value of interest does not depend on the
    // excluded index, so look it up once instead of on every iteration
    int weight = weighted ? dag.getWeight(functionNode) : 1;

    // exclude each index, in turn
    for (int i = 0; i < inputs_sz; i++)
    {
        AST.Address xtree = input_cells[i];

        if (RejectNullHypothesis(boots, initial_output, i, significance))
        {
            // add weight to score if test fails
            if (iexc_scores.ContainsKey(xtree))
            {
                iexc_scores[xtree] += weight;
            }
            else
            {
                iexc_scores.Add(xtree, weight);
            }
        }
        else
        {
            // we need to at least add the value to the tree
            if (!iexc_scores.ContainsKey(xtree))
            {
                iexc_scores.Add(xtree, 0);
            }
        }
    }
    return iexc_scores;
}
// Builds the dependence graph for the workbook and snapshots the
// spreadsheet's original inputs and correct outputs so that a simulation
// can later be compared against (and reset to) the pristine state.
public static PrepData PrepSimulation(Excel.Application app, Excel.Workbook wbh, ProgBar pb, bool ignore_parse_errors)
{
    // build graph
    var dag = new DAG(wbh, app, ignore_parse_errors);
    if (dag.containsLoop())
    {
        throw new DataDebugMethods.ContainsLoopException();
    }
    pb.IncrementProgress();

    // get terminal input and terminal formula nodes once;
    // the boolean indicates whether to use all outputs or not
    var terminal_input_nodes = dag.terminalInputVectors();
    var terminal_formula_nodes = dag.terminalFormulaNodes(true);

    if (terminal_input_nodes.Length == 0)
    {
        throw new NoRangeInputs();
    }

    if (terminal_formula_nodes.Length == 0)
    {
        throw new NoFormulas();
    }

    // save original spreadsheet state
    CellDict original_inputs = UserSimulation.Utility.SaveInputs(dag);

    // force a recalculation before saving outputs, otherwise we may
    // erroneously conclude that the procedure did the wrong thing
    // based solely on Excel floating-point oddities
    UserSimulation.Utility.InjectValues(app, wbh, original_inputs);

    // save function outputs
    CellDict correct_outputs = UserSimulation.Utility.SaveOutputs(terminal_formula_nodes, dag);

    return new PrepData()
    {
        dag = dag,
        original_inputs = original_inputs,
        correct_outputs = correct_outputs,
        terminal_input_nodes = terminal_input_nodes,
        terminal_formula_nodes = terminal_formula_nodes
    };
}
// this function returns an address but also updates
// the filtered_high_scores list
//
// Returns the most suspicious cell not yet marked known-good, or null when
// no suspects remain. Re-runs the (expensive) bootstrap analysis only when
// run_bootstrap is set; otherwise it just re-filters the existing list.
public static AST.Address CheckCell_Step(UserResults o, double significance, CutoffKind ck, int nboots, DAG dag, Excel.Application app, bool weighted, bool all_outputs, bool run_bootstrap, HashSet<AST.Address> known_good, ref List<KeyValuePair<AST.Address, int>> filtered_high_scores, long max_duration_in_ms, Stopwatch sw, ProgBar pb)
{
    // Get bootstraps
    // The bootstrap should only re-run if there is a correction made,
    // not when something is marked as OK (isn't one of the introduced errors)
    // The list of suspected cells doesn't change when we mark something as OK,
    // we just move on to the next thing in the list
    if (run_bootstrap)
    {
        TreeScore scores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

        // apply a threshold to the scores
        filtered_high_scores = ck.applyCutoff(scores, known_good);
    }
    else // if no corrections were made (a cell was marked as OK, not corrected)
    {
        // re-filter out cells marked as OK
        filtered_high_scores = filtered_high_scores.Where(kvp => !known_good.Contains(kvp.Key)).ToList();
    }

    // get AST.Address corresponding to most unusual score, if any remain
    // (use the List.Count property rather than LINQ's Count(): no enumeration)
    if (filtered_high_scores.Count > 0)
    {
        return filtered_high_scores[0].Key;
    }
    else
    {
        return null;
    }
}
// dict of exclusion scores for each input CELL TreeNode
private TreeScore _score;

// Sets up a single hypothesis-test work item for one input range.
// All parameters are stashed in fields; the actual work runs later on a
// thread-pool thread (see threadPoolCallback, queued by Inference).
//   dag:             the spreadsheet dependence graph
//   bs:              bootstrapped outputs, indexed [output index][resample index]
//   initial_outputs: original value of each output cell
//   input:           the input range this job tests
//   output_arr:      the output cells recomputed under each bootstrap
//   weighted:        whether scores are weighted by function weight
//   significance:    significance level for the hypothesis test
//   mre:             event signaled when this job finishes
public DataDebugJob(
    DAG dag,
    FunctionOutput <String>[][] bs,
    Dictionary <AST.Address, string> initial_outputs,
    AST.Range input,
    AST.Address[] output_arr,
    bool weighted,
    double significance,
    ManualResetEvent mre)
{
    _dag = dag;
    _bs = bs;
    _initial_outputs = initial_outputs;
    _input = input;
    _outputs = output_arr;
    _weighted = weighted;
    _significance = significance;
    _mre = mre;
    // start with an empty score accumulator
    _score = new TreeScore();
}
// Fits a normal distribution over the entire set of terminal input vectors
// and returns the most unusual cell that the user has not already vetted,
// or null when every candidate outlier is known-good (or none exist).
// Throws TimeoutException when max_duration_in_ms is exceeded.
public static AST.Address NormalAllOutputs_Step(DAG dag, Excel.Application app, Excel.Workbook wb, HashSet<AST.Address> known_good, long max_duration_in_ms, Stopwatch sw)
{
    // Generate a normal distribution for the entire set of inputs
    var normal_dist = new DataDebugMethods.NormalDistribution(dag.terminalInputVectors(), app);

    // walk the outliers in order and return the first one
    // not already marked as known-good
    int n_errors = normal_dist.getErrorsCount();
    for (int i = 0; i < n_errors; i++)
    {
        // check for timeout
        if (sw.ElapsedMilliseconds > max_duration_in_ms)
        {
            throw new TimeoutException("Timeout exception in NormalAllOutputs_Step.");
        }

        var candidate = AST.Address.AddressFromCOMObject(normal_dist.getErrorAtPosition(i), wb);
        if (!known_good.Contains(candidate))
        {
            return candidate;
        }
    }

    // no flaggable cell found
    return null;
}
// String hypothesis test: for each input cell of rangeNode, tests whether
// excluding that input changes the function output from initial_output,
// and accumulates a (possibly weighted) exclusion score per input cell.
//
// Returns: TreeScore mapping each input cell to its exclusion score
//          (every input cell gets an entry, zero when the test passes).
public static TreeScore StringHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput<string>[] boots, string initial_output, bool weighted, double significance)
{
    // this function's input cells
    var input_cells = rangeNode.Addresses();

    // scores
    var iexc_scores = new TreeScore();

    var inputs_sz = input_cells.Count();

    // the weight of the function value of interest does not depend on the
    // excluded index, so look it up once instead of on every iteration
    int weight = weighted ? dag.getWeight(functionNode) : 1;

    // exclude each index, in turn
    for (int i = 0; i < inputs_sz; i++)
    {
        AST.Address xtree = input_cells[i];

        if (RejectNullHypothesis(boots, initial_output, i, significance))
        {
            // add weight to score if test fails
            if (iexc_scores.ContainsKey(xtree))
            {
                iexc_scores[xtree] += weight;
            }
            else
            {
                iexc_scores.Add(xtree, weight);
            }
        }
        else
        {
            // we need to at least add the value to the tree
            if (!iexc_scores.ContainsKey(xtree))
            {
                iexc_scores.Add(xtree, 0);
            }
        }
    }
    return iexc_scores;
}
// Reads the current string value of every output address, doing one fast
// bulk array read per worksheet (of that sheet's used range) instead of a
// slow per-cell COM read. Whitespace-only values are normalized to "".
public static Dictionary<AST.Address, string> StoreOutputs(AST.Address[] outputs, DAG dag)
{
    // output dict
    var d = new Dictionary<AST.Address, string>();

    // partition all of the output addresses by their worksheet
    var addr_groups = outputs.GroupBy(addr => dag.getCOMRefForAddress(addr).WorksheetName);

    foreach (IEnumerable<AST.Address> ws_fns in addr_groups)
    {
        // get worksheet used range
        var fstcr = dag.getCOMRefForAddress(ws_fns.First());
        var rng = fstcr.Worksheet.UsedRange;

        // used range dimensions
        var left = rng.Column;
        var right = rng.Columns.Count + left - 1;
        var top = rng.Row;
        var bottom = rng.Rows.Count + top - 1;

        // names
        var wsname = new FSharpOption<string>(fstcr.WorksheetName);
        var wbname = new FSharpOption<string>(fstcr.WorkbookName);
        var path = fstcr.Path;

        if (left != right || top != bottom)
        {
            // multi-cell used range: one bulk read, then index into the
            // array; Value2 is indexed [row, column], i.e. [y, x]
            object[,] data = rng.Value2; // fast array read
            var x_del = left - 1;
            var y_del = top - 1;
            foreach (AST.Address addr in ws_fns)
            {
                String s = System.Convert.ToString(data[addr.Y - y_del, addr.X - x_del]);
                d.Add(addr, String.IsNullOrWhiteSpace(s) ? "" : s);
            }
        }
        else
        {
            // single-cell used range: Value2 is a scalar, not an array
            AST.Address addr = AST.Address.NewFromR1C1(top, left, wsname, wbname, path);
            String s = System.Convert.ToString(rng.Value2);
            d.Add(addr, String.IsNullOrWhiteSpace(s) ? "" : s);
        }
    }
    return d;
}
// Numeric hypothesis test: for each input cell of rangeNode, tests whether
// excluding that input makes the original function output fall outside the
// bootstrapped confidence bounds, and accumulates a (possibly weighted)
// exclusion score per input cell.
//
// Returns: TreeScore mapping each input cell to its exclusion score
//          (every input cell gets an entry, zero when the test passes).
public static TreeScore NumericHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput<string>[] boots, string initial_output, bool weighted, double significance)
{
    // this function's input cells
    var input_cells = rangeNode.Addresses();
    var inputs_sz = input_cells.Count();

    // scores
    var input_exclusion_scores = new TreeScore();

    // convert to numeric
    var numeric_boots = ConvertToNumericOutput(boots);

    // sort
    var sorted_num_boots = SortBootstraps(numeric_boots);

    // the weight of the function value of interest does not depend on the
    // excluded index, so look it up once instead of on every iteration
    int weight = weighted ? dag.getWeight(functionNode) : 1;

    // for each excluded index, test whether the original input
    // falls outside our bootstrap confidence bounds
    for (int i = 0; i < inputs_sz; i++)
    {
        AST.Address xtree = input_cells[i];

        double outlieriness = RejectNullHypothesis(sorted_num_boots, initial_output, i, significance);

        if (outlieriness != 0.0)
        {
            // add weighted outlieriness to this input's running score
            if (input_exclusion_scores.ContainsKey(xtree))
            {
                input_exclusion_scores[xtree] += (int)(weight * outlieriness);
            }
            else
            {
                input_exclusion_scores.Add(xtree, (int)(weight * outlieriness));
            }
        }
        else
        {
            // we need to at least add the value to the tree
            if (!input_exclusion_scores.ContainsKey(xtree))
            {
                input_exclusion_scores.Add(xtree, 0);
            }
        }
    }
    return input_exclusion_scores;
}
// Bootstraps every input range and runs one hypothesis-test job per range
// on the thread pool, returning the merged exclusion scores.
//
// num_bootstraps:  number of resamples per input range
// resamples:       [input index][bootstrap index] resampled inputs
// initial_inputs:  original sample for each input range
// initial_outputs: original value of each output cell
// input_arr:       the input ranges under test
// output_arr:      the output cells to observe
// weighted:        weigh scores by function weight
// significance:    hypothesis-test significance level
public static TreeScore Inference(
    int num_bootstraps,
    InputSample[][] resamples,
    Dictionary<AST.Range, InputSample> initial_inputs,
    Dictionary<AST.Address, string> initial_outputs,
    AST.Range[] input_arr,
    AST.Address[] output_arr,
    DAG dag,
    bool weighted,
    double significance,
    ProgBar pb)
{
    // init thread event notification array
    var mres = new ManualResetEvent[input_arr.Length];

    // init job storage
    var ddjs = new DataDebugJob[input_arr.Length];

    // init started jobs count
    var sjobs = 0;

    // init completed jobs count
    var cjobs = 0;

    // last-ditch effort flag
    bool last_try = false;

    // init score storage
    var scores = new TreeScore();

    for (int i = 0; i < input_arr.Length; i++)
    {
        try
        {
            #region BOOTSTRAP
            // bootstrapping is done in the parent STA thread because
            // the .NET threading model prohibits thread pools (which
            // are MTA) from accessing STA COM objects directly.

            // alloc bootstrap storage for each output (f), for each resample (b)
            FunctionOutput<string>[][] bs = new FunctionOutput<string>[initial_outputs.Count][];
            for (int f = 0; f < initial_outputs.Count; f++)
            {
                bs[f] = new FunctionOutput<string>[num_bootstraps];
            }

            // init memoization table for input vector i
            var memo = new BootMemo();

            // fetch the input range TreeNode
            var input = input_arr[i];

            // fetch the input range COM object
            var com = dag.getCOMRefForRange(input).Range;

            // compute outputs
            // replace the values of the COM object with the jth bootstrap,
            // save all function outputs, and
            // restore the original input
            for (var b = 0; b < num_bootstraps; b++)
            {
                // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                FunctionOutput<string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                for (var f = 0; f < output_arr.Length; f++)
                {
                    bs[f][b] = fos[f];
                }
            }

            // restore the original inputs; faster to do once, after bootstrapping is done
            BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

            // TODO: restore formulas if it turns out that they were overwritten
            // this should never be the case
            #endregion BOOTSTRAP

            #region HYPOTHESIS_TEST
            // completion event for this job
            mres[i] = new ManualResetEvent(false);

            // set up job
            ddjs[i] = new DataDebugJob(
                dag,
                bs,
                initial_outputs,
                input_arr[i],
                output_arr,
                weighted,
                significance,
                mres[i]
            );

            sjobs++;

            // hand job to thread pool
            ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
            #endregion HYPOTHESIS_TEST

            // update progress bar
            pb.IncrementProgress();
        }
        catch (System.OutOfMemoryException)
        {
            if (!last_try)
            {
                // If there are no more jobs running, but we still can't
                // allocate memory, try invoking GC and then trying again.
                // Only the first i events exist; later entries are null,
                // so counting the whole array would NRE.
                cjobs = mres.Take(i).Count(mre => mre.WaitOne(0));
                if (sjobs - cjobs == 0)
                {
                    GC.Collect();
                    last_try = true;
                }
            }
            else
            {
                // we just don't have enough memory; rethrow preserving
                // the original stack trace ("throw e;" would reset it)
                throw;
            }

            // wait for any of the 0..i-1 work items to complete
            // (WaitAny throws on an empty array, so skip when i == 0)
            if (i > 0)
            {
                WaitHandle.WaitAny(mres.Take(i).ToArray());
            }

            // retry this same input range; falling through would skip it
            // and leave mres[i] null, crashing the merge loop below
            i--;
        }
    }

    // Do not proceed until all hypothesis tests are done.
    // WaitHandle.WaitAll cannot be called on an STA thread which
    // is why we call WaitOne in a loop.
    // Merge scores as data becomes available.
    for (int i = 0; i < input_arr.Length; i++)
    {
        mres[i].WaitOne();
        scores = DictAdd(scores, ddjs[i].Result);
    }

    return scores;
}
// Get dictionary of inputs and the error they produce
// For each input cell, injects k candidate typos (generated from the given
// classification file), measures the total output error each produces, and
// records the typo that caused the largest error along with its magnitude.
public Dictionary<AST.Address, Tuple<string, double>> TopOfKErrors(AST.Address[] terminal_formula_nodes, CellDict inputs, int k, CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, string classification_file, DAG dag)
{
    var eg = new ErrorGenerator();
    var c = Classification.Deserialize(classification_file);
    var max_error_produced_dictionary = new Dictionary<AST.Address, Tuple<string, double>>();

    foreach (KeyValuePair<AST.Address, string> pair in inputs)
    {
        AST.Address addr = pair.Key;
        string orig_value = pair.Value;

        // the worst typo observed so far for this cell
        double max_error_produced = 0.0;
        string max_error_string = "";

        // get k strings, in parallel
        string[] errorstrings = eg.GenerateErrorStrings(orig_value, c, k);

        for (int i = 0; i < k; i++)
        {
            // inject the typo
            var cd = new CellDict();
            cd.Add(addr, errorstrings[i]);
            Utility.InjectValues(app, wb, cd);

            // save function outputs under the typo
            CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_nodes, dag);

            // remove the typo that was introduced
            cd.Clear();
            cd.Add(addr, orig_value);
            Utility.InjectValues(app, wb, cd);

            // keep track of the largest observed max error
            double total_error = Utility.CalculateTotalError(correct_outputs, incorrect_outputs);
            if (total_error > max_error_produced)
            {
                max_error_produced = total_error;
                max_error_string = errorstrings[i];
            }
        }

        // record this cell's worst typo and the error it produced
        max_error_produced_dictionary.Add(addr, new Tuple<string, double>(max_error_string, max_error_produced));
    }
    return max_error_produced_dictionary;
}
// num_bootstraps: the number of bootstrap samples to get
// inputs: a list of inputs; each TreeNode represents an entire input range
// outputs: a list of outputs; each TreeNode represents a function
// Weighs the graph, snapshots inputs/outputs, resamples every input range,
// and then hands everything to Inference for the hypothesis tests.
public static TreeScore DataDebug(int num_bootstraps, DAG dag, Excel.Application app, bool weighted, bool all_outputs, long max_duration_in_ms, Stopwatch sw, double significance, ProgBar pb)
{
    // this modifies the weights of each node
    PropagateWeights(dag);

    // terminal formulas and terminal input ranges only
    var output_fns = dag.terminalFormulaNodes(all_outputs);
    var input_rngs = dag.terminalInputVectors();

    // RNG for sampling
    var rng = new Random();

    // we save initial inputs and outputs here
    var initial_inputs = StoreInputs(input_rngs, dag);
    var initial_outputs = StoreOutputs(output_fns, dag);

    // Set progress bar max
    pb.setMax(input_rngs.Length * 2);

    #region RESAMPLE
    // resamples[i][b] is the bth bootstrap of the ith input range
    var resamples = new InputSample[input_rngs.Length][];
    for (int i = 0; i < input_rngs.Length; i++)
    {
        resamples[i] = Resample(num_bootstraps, initial_inputs[input_rngs[i]], rng);

        // update progress bar
        pb.IncrementProgress();
    }
    #endregion RESAMPLE

    #region INFERENCE
    return Inference(
        num_bootstraps,
        resamples,
        initial_inputs,
        initial_outputs,
        input_rngs,
        output_fns,
        dag,
        weighted,
        significance,
        pb);
    #endregion INFERENCE
}
// Snapshots every input range's current values with one bulk COM read per
// range. As a side effect, writes each snapshot straight back into the
// sheet so Excel recalculates exactly as it will during bootstrapping.
private static Dictionary<AST.Range, InputSample> StoreInputs(AST.Range[] inputs, DAG dag)
{
    var snapshot = new Dictionary<AST.Range, InputSample>();
    foreach (AST.Range input_range in inputs)
    {
        var comref = dag.getCOMRefForRange(input_range);

        // capture the entire COM array as a multiarray in one fell swoop
        var sample = new InputSample(comref.Height, comref.Width);
        sample.AddArray(comref.Range.Value2);

        // add stored input to dict
        snapshot.Add(input_range, sample);

        // this is to force excel to recalculate its outputs
        // exactly the same way that it will for our bootstraps
        BootMemo.ReplaceExcelRange(comref.Range, sample);
    }
    return snapshot;
}
// Reads the current string value of every output address, doing one fast
// bulk array read per worksheet (of that sheet's used range) instead of a
// slow per-cell COM read. Whitespace-only values are normalized to "".
public static Dictionary <AST.Address, string> StoreOutputs(AST.Address[] outputs, DAG dag)
{
    // output dict
    var d = new Dictionary <AST.Address, string>();

    // partition all of the output addresses by their worksheet
    var addr_groups = outputs.GroupBy(addr => dag.getCOMRefForAddress(addr).WorksheetName);

    foreach (IEnumerable <AST.Address> ws_fns in addr_groups)
    {
        // get worksheet used range
        var fstcr = dag.getCOMRefForAddress(ws_fns.First());
        var rng = fstcr.Worksheet.UsedRange;

        // used range dimensions
        var left = rng.Column;
        var right = rng.Columns.Count + left - 1;
        var top = rng.Row;
        var bottom = rng.Rows.Count + top - 1;

        // names
        var wsname = new FSharpOption <string>(fstcr.WorksheetName);
        var wbname = new FSharpOption <string>(fstcr.WorkbookName);
        var path = fstcr.Path;

        if (left != right || top != bottom)
        {
            // multi-cell used range: one bulk read, then index into the
            // array; Value2 is indexed [row, column], i.e. [y, x]
            object[,] data = rng.Value2; // fast array read
            var x_del = left - 1;
            var y_del = top - 1;
            foreach (AST.Address addr in ws_fns)
            {
                String s = System.Convert.ToString(data[addr.Y - y_del, addr.X - x_del]);
                d.Add(addr, String.IsNullOrWhiteSpace(s) ? "" : s);
            }
        }
        else
        {
            // single-cell used range: Value2 is a scalar, not an array
            AST.Address addr = AST.Address.NewFromR1C1(top, left, wsname, wbname, path);
            String s = System.Convert.ToString(rng.Value2);
            d.Add(addr, String.IsNullOrWhiteSpace(s) ? "" : s);
        }
    }
    return d;
}
// Bootstraps every input range and runs one hypothesis-test job per range
// on the thread pool, returning the merged exclusion scores.
//
// num_bootstraps:  number of resamples per input range
// resamples:       [input index][bootstrap index] resampled inputs
// initial_inputs:  original sample for each input range
// initial_outputs: original value of each output cell
// input_arr:       the input ranges under test
// output_arr:      the output cells to observe
// weighted:        weigh scores by function weight
// significance:    hypothesis-test significance level
public static TreeScore Inference(
    int num_bootstraps,
    InputSample[][] resamples,
    Dictionary <AST.Range, InputSample> initial_inputs,
    Dictionary <AST.Address, string> initial_outputs,
    AST.Range[] input_arr,
    AST.Address[] output_arr,
    DAG dag,
    bool weighted,
    double significance,
    ProgBar pb)
{
    // init thread event notification array
    var mres = new ManualResetEvent[input_arr.Length];

    // init job storage
    var ddjs = new DataDebugJob[input_arr.Length];

    // init started jobs count
    var sjobs = 0;

    // init completed jobs count
    var cjobs = 0;

    // last-ditch effort flag
    bool last_try = false;

    // init score storage
    var scores = new TreeScore();

    for (int i = 0; i < input_arr.Length; i++)
    {
        try
        {
            #region BOOTSTRAP
            // bootstrapping is done in the parent STA thread because
            // the .NET threading model prohibits thread pools (which
            // are MTA) from accessing STA COM objects directly.

            // alloc bootstrap storage for each output (f), for each resample (b)
            FunctionOutput <string>[][] bs = new FunctionOutput <string> [initial_outputs.Count][];
            for (int f = 0; f < initial_outputs.Count; f++)
            {
                bs[f] = new FunctionOutput <string> [num_bootstraps];
            }

            // init memoization table for input vector i
            var memo = new BootMemo();

            // fetch the input range TreeNode
            var input = input_arr[i];

            // fetch the input range COM object
            var com = dag.getCOMRefForRange(input).Range;

            // compute outputs
            // replace the values of the COM object with the jth bootstrap,
            // save all function outputs, and
            // restore the original input
            for (var b = 0; b < num_bootstraps; b++)
            {
                // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                FunctionOutput <string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                for (var f = 0; f < output_arr.Length; f++)
                {
                    bs[f][b] = fos[f];
                }
            }

            // restore the original inputs; faster to do once, after bootstrapping is done
            BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

            // TODO: restore formulas if it turns out that they were overwritten
            // this should never be the case
            #endregion BOOTSTRAP

            #region HYPOTHESIS_TEST
            // completion event for this job
            mres[i] = new ManualResetEvent(false);

            // set up job
            ddjs[i] = new DataDebugJob(
                dag,
                bs,
                initial_outputs,
                input_arr[i],
                output_arr,
                weighted,
                significance,
                mres[i]
            );

            sjobs++;

            // hand job to thread pool
            ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
            #endregion HYPOTHESIS_TEST

            // update progress bar
            pb.IncrementProgress();
        }
        catch (System.OutOfMemoryException)
        {
            if (!last_try)
            {
                // If there are no more jobs running, but we still can't
                // allocate memory, try invoking GC and then trying again.
                // Only the first i events exist; later entries are null,
                // so counting the whole array would NRE.
                cjobs = mres.Take(i).Count(mre => mre.WaitOne(0));
                if (sjobs - cjobs == 0)
                {
                    GC.Collect();
                    last_try = true;
                }
            }
            else
            {
                // we just don't have enough memory; rethrow preserving
                // the original stack trace ("throw e;" would reset it)
                throw;
            }

            // wait for any of the 0..i-1 work items to complete
            // (WaitAny throws on an empty array, so skip when i == 0)
            if (i > 0)
            {
                WaitHandle.WaitAny(mres.Take(i).ToArray());
            }

            // retry this same input range; falling through would skip it
            // and leave mres[i] null, crashing the merge loop below
            i--;
        }
    }

    // Do not proceed until all hypothesis tests are done.
    // WaitHandle.WaitAll cannot be called on an STA thread which
    // is why we call WaitOne in a loop.
    // Merge scores as data becomes available.
    for (int i = 0; i < input_arr.Length; i++)
    {
        mres[i].WaitOne();
        scores = DictAdd(scores, ddjs[i].Result);
    }

    return scores;
}
// Snapshots every input range's current values with one bulk COM read per
// range. As a side effect, writes each snapshot straight back into the
// sheet so Excel recalculates exactly as it will during bootstrapping.
private static Dictionary <AST.Range, InputSample> StoreInputs(AST.Range[] inputs, DAG dag)
{
    var snapshot = new Dictionary <AST.Range, InputSample>();
    foreach (AST.Range input_range in inputs)
    {
        var comref = dag.getCOMRefForRange(input_range);

        // capture the entire COM array as a multiarray in one fell swoop
        var sample = new InputSample(comref.Height, comref.Width);
        sample.AddArray(comref.Range.Value2);

        // add stored input to dict
        snapshot.Add(input_range, sample);

        // this is to force excel to recalculate its outputs
        // exactly the same way that it will for our bootstraps
        BootMemo.ReplaceExcelRange(comref.Range, sample);
    }
    return snapshot;
}
// num_bootstraps: the number of bootstrap samples to get
// inputs: a list of inputs; each TreeNode represents an entire input range
// outputs: a list of outputs; each TreeNode represents a function
// Weighs the graph, snapshots inputs/outputs, resamples every input range,
// and then hands everything to Inference for the hypothesis tests.
public static TreeScore DataDebug(int num_bootstraps, DAG dag, Excel.Application app, bool weighted, bool all_outputs, long max_duration_in_ms, Stopwatch sw, double significance, ProgBar pb)
{
    // this modifies the weights of each node
    PropagateWeights(dag);

    // terminal formulas and terminal input ranges only
    var output_fns = dag.terminalFormulaNodes(all_outputs);
    var input_rngs = dag.terminalInputVectors();

    // RNG for sampling
    var rng = new Random();

    // we save initial inputs and outputs here
    var initial_inputs = StoreInputs(input_rngs, dag);
    var initial_outputs = StoreOutputs(output_fns, dag);

    // Set progress bar max
    pb.setMax(input_rngs.Length * 2);

    #region RESAMPLE
    // resamples[i][b] is the bth bootstrap of the ith input range
    var resamples = new InputSample[input_rngs.Length][];
    for (int i = 0; i < input_rngs.Length; i++)
    {
        resamples[i] = Resample(num_bootstraps, initial_inputs[input_rngs[i]], rng);

        // update progress bar
        pb.IncrementProgress();
    }
    #endregion RESAMPLE

    #region INFERENCE
    return (Inference(
        num_bootstraps,
        resamples,
        initial_inputs,
        initial_outputs,
        input_rngs,
        output_fns,
        dag,
        weighted,
        significance,
        pb));
    #endregion INFERENCE
}
// Runs the full DataDebug analysis on the active workbook, populating
// _flaggable with suspect cells ordered by descending anomaly score.
// max_duration_in_ms bounds the analysis time (enforced downstream).
public void Analyze(long max_duration_in_ms)
{
    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    using (var pb = new ProgBar())
    {
        // Disable screen updating during analysis to speed things up.
        // The finally block guarantees it is re-enabled on every exit
        // path; the old code left it disabled on unexpected exceptions.
        _app.ScreenUpdating = false;
        try
        {
            // Build dependency graph (modifies data); a ParseException
            // propagates to the caller with its original stack trace
            // (the old catch used "throw e;", which resets it)
            _dag = new DAG(_app.ActiveWorkbook, _app, IGNORE_PARSE_ERRORS);
            _dag.numberOfInputCells(); // result unused; call retained in case it has side effects

            if (_dag.terminalInputVectors().Length == 0)
            {
                System.Windows.Forms.MessageBox.Show("This spreadsheet contains no vector-input functions.");
                _flaggable = new KeyValuePair<AST.Address, int>[0];
                return;
            }

            // Get bootstraps, ordered by descending anomaly score
            var scores = Analysis.DataDebug(NBOOTS, _dag, _app, weighted: USE_WEIGHTS, all_outputs: CONSIDER_ALL_OUTPUTS, max_duration_in_ms: max_duration_in_ms, sw: sw, significance: _tool_significance, pb: pb).OrderByDescending(pair => pair.Value).ToArray();

            if (_debug_mode)
            {
                var score_str = String.Join("\n", scores.Take(10).Select(score => score.Key.A1FullyQualified() + " -> " + score.Value.ToString()));
                System.Windows.Forms.MessageBox.Show(score_str);
                System.Windows.Forms.Clipboard.SetText(score_str);
            }

            // nothing scored: nothing to flag (also guards the indexing below)
            if (scores.Length == 0)
            {
                _flaggable = new KeyValuePair<AST.Address, int>[0];
                return;
            }

            // calculate cutoff index; clamp so scores[thresh] is always
            // a valid index (the old code could index one past the end
            // when significance was very small)
            int thresh = scores.Length - Convert.ToInt32(scores.Length * _tool_significance);
            if (thresh >= scores.Length) { thresh = scores.Length - 1; }
            if (thresh < 0) { thresh = 0; }

            // filter out cells that are...
            _flaggable = scores.Where(pair => pair.Value >= scores[thresh].Value) // below threshold
                               .Where(pair => !_known_good.Contains(pair.Key))   // known to be good
                               .Where(pair => pair.Value != 0).ToArray();        // score == 0
        }
        finally
        {
            // Enable screen updating when we're done (or on failure)
            _app.ScreenUpdating = true;
        }
        sw.Stop();
    }
}
// Recursively computes and stores a node's weight: a plain input cell
// weighs 1; a formula weighs the sum of all of its inputs' weights.
private static int PropagateNodeWeight(AST.Address node, DAG dag)
{
    if (!dag.isFormula(node))
    {
        // base case: node is an input cell
        dag.setWeight(node, 1);
        return 1;
    }

    // gather every input address: cells of each input vector range,
    // followed by the single-cell inputs
    var all_inputs = dag.getFormulaInputVectors(node)
                        .SelectMany(vrng => vrng.Addresses())
                        .Concat(dag.getFormulaSingleCellInputs(node));

    // a formula's weight is the sum of its inputs' weights
    var total = all_inputs.Sum(input => PropagateNodeWeight(input, dag));
    dag.setWeight(node, total);
    return total;
}
// save all of the values of the spreadsheet that
// participate in any computation
// Returns a CellDict mapping each computation cell's address to its
// current COM value. Any failure is wrapped with context.
public static CellDict SaveInputs(DAG dag)
{
    try
    {
        var cd = new CellDict();
        foreach (var addr in dag.allComputationCells())
        {
            cd.Add(addr, dag.readCOMValueAtAddress(addr));
        }
        return cd;
    }
    catch (Exception e)
    {
        // wrap with context, but keep the original exception (and its
        // stack trace) as InnerException instead of discarding it
        throw new Exception(String.Format("Failed in SaveInputs: {0}", e.Message), e);
    }
}
// Sets up a single hypothesis-test work item for one input range.
// All parameters are stashed in fields; the actual work runs later on a
// thread-pool thread (see threadPoolCallback, queued by Inference).
//   dag:             the spreadsheet dependence graph
//   bs:              bootstrapped outputs, indexed [output index][resample index]
//   initial_outputs: original value of each output cell
//   input:           the input range this job tests
//   output_arr:      the output cells recomputed under each bootstrap
//   weighted:        whether scores are weighted by function weight
//   significance:    significance level for the hypothesis test
//   mre:             event signaled when this job finishes
public DataDebugJob(
    DAG dag,
    FunctionOutput<String>[][] bs,
    Dictionary<AST.Address, string> initial_outputs,
    AST.Range input,
    AST.Address[] output_arr,
    bool weighted,
    double significance,
    ManualResetEvent mre)
{
    _dag = dag;
    _bs = bs;
    _initial_outputs = initial_outputs;
    _input = input;
    _outputs = output_arr;
    _weighted = weighted;
    _significance = significance;
    _mre = mre;
    // start with an empty score accumulator
    _score = new TreeScore();
}
// Get dictionary of inputs and the error they produce
// For each input, tries k generated typos and keeps the one producing the
// largest total output error, then returns the worst 5% of inputs as a
// map from input address to its worst typo string.
public static CellDict GenImportantErrors(AST.Address[] output_nodes, CellDict inputs, int k, // number of alternatives to consider
                                          CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, Classification c, DAG dag)
{
    var eg = new ErrorGenerator();
    var max_error_produced_dictionary = new Dictionary<AST.Address, Tuple<string, double>>();

    foreach (KeyValuePair<AST.Address, string> pair in inputs)
    {
        AST.Address addr = pair.Key;
        string orig_value = pair.Value;

        // the worst typo observed so far for this cell
        double max_error_produced = 0.0;
        string max_error_string = "";

        // get k strings
        string[] errorstrings = eg.GenerateErrorStrings(orig_value, c, k);

        for (int i = 0; i < k; i++)
        {
            // inject the typo
            var cd = new CellDict();
            cd.Add(addr, errorstrings[i]);
            InjectValues(app, wb, cd);

            // save function outputs under the typo
            CellDict incorrect_outputs = SaveOutputs(output_nodes, dag);

            // remove the typo that was introduced
            cd.Clear();
            cd.Add(addr, orig_value);
            InjectValues(app, wb, cd);

            // keep track of the largest observed max error
            double total_error = Utility.CalculateTotalError(correct_outputs, incorrect_outputs);
            if (total_error > max_error_produced)
            {
                max_error_produced = total_error;
                max_error_string = errorstrings[i];
            }
        }

        // record this cell's worst typo and the error it produced
        max_error_produced_dictionary.Add(addr, new Tuple<string, double>(max_error_string, max_error_produced));
    }

    // sort by error magnitude and keep the worst 5% of inputs
    var maxen = max_error_produced_dictionary
        .OrderByDescending(p => p.Value.Item2)
        .Select(p => new Tuple<AST.Address, string>(p.Key, p.Value.Item1))
        .ToList();
    return maxen.Take((int)Math.Ceiling(0.05 * inputs.Count)).ToDictionary(tup => tup.Item1, tup => tup.Item2);
}
// Runs one full simulated debugging session: injects the errors stored in
// _errors, simulates a user removing flagged errors, then computes error,
// effort, and precision statistics. MODIFIES and then restores the workbook.
// Returns the number of cells the simulated user inspected.
public int Run(int nboots,                       // number of bootstraps
               string xlfile,                    // name of the workbook
               double significance,              // significance threshold for test
               CutoffKind ck,                    // kind of threshold function to use
               Excel.Application app,            // reference to Excel app
               Classification c,                 // data from which to generate errors
               Random r,                         // a random number generator
               AnalysisType analysisType,        // the type of analysis to run
               bool weighted,                    // should we weigh things?
               bool all_outputs,                 // if !all_outputs, we only consider terminal outputs
               DAG dag,
               Excel.Workbook wb,
               AST.Address[] terminal_formula_cells,
               AST.Range[] terminal_input_vectors,
               CellDict original_inputs,
               CellDict correct_outputs,
               long max_duration_in_ms,
               String logfile,                   // filename for the output log
               ProgBar pb
              )
{
    // record workbook name/path and run configuration for later reporting
    _wb_name = xlfile;
    _wb_path = wb.Path;
    _analysis_type = analysisType;
    _significance = significance;
    _all_outputs = all_outputs;
    _weighted = weighted;

    // inject the precomputed errors from _errors into the workbook
    // (NOTE(review): _errors must have been populated beforehand, e.g. by
    // RunFromBatch — this method does not generate them itself)
    Utility.InjectValues(app, wb, _errors);

    // snapshot the now-incorrect function outputs
    CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

    // time the error-removal phase only
    Stopwatch sw = new Stopwatch();
    sw.Start();

    // remove errors until none remain; MODIFIES WORKBOOK
    _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, sw, logfile, pb);
    sw.Stop();
    TimeSpan elapsed = sw.Elapsed;
    _analysis_time = elapsed.TotalSeconds;

    // snapshot outputs after the simulated user's (possibly partial) corrections
    var partially_corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

    // total relative error remaining after the simulated debugging session
    _error = Utility.CalculateNormalizedError(correct_outputs, partially_corrected_outputs, _user.max_errors);
    _total_relative_error = Utility.TotalRelativeError(_error);

    // starting total relative error (normalized by max_errors), i.e. the
    // error level before any corrections were applied
    ErrorDict starting_error = Utility.CalculateNormalizedError(correct_outputs, incorrect_outputs, _user.max_errors);
    _initial_total_relative_error = Utility.TotalRelativeError(starting_error);

    // effort = fraction of all cells the simulated user had to inspect
    _max_effort = dag.allCells().Length;
    _effort = (_user.true_positives.Count + _user.false_positives.Count);
    _expended_effort = (double)_effort / (double)_max_effort;

    // compute average precision
    // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
    // where P(k) is the precision at threshold k,
    // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
    _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

    // restore the workbook to its original, error-free input values
    Utility.InjectValues(app, wb, original_inputs);

    // DAG construction time, converted from milliseconds to seconds
    _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

    // flag that we're done; safe to print output results
    _simulation_run = true;

    // return the number of cells inspected
    return _effort;
}
// Scans each input vector with a per-range normal-distribution outlier test
// and returns the first flagged cell that has not already been marked good.
// Returns null when no un-inspected outlier exists in any range.
// Throws TimeoutException if sw exceeds max_duration_in_ms mid-scan.
public static AST.Address NormalPerRange_Step(DAG dag, Excel.Workbook wb, HashSet<AST.Address> known_good, long max_duration_in_ms, Stopwatch sw)
{
    // fit a normal distribution to every input range; stop at the first
    // range that yields a not-yet-inspected outlier
    foreach (var vect_addr in dag.allVectors())
    {
        var normal_dist = new DataDebugMethods.NormalDistribution(dag.getCOMRefForRange(vect_addr).Range);

        // walk this range's outliers in order, skipping already-inspected cells
        for (int i = 0; i < normal_dist.getErrorsCount(); i++)
        {
            // bail out if the analysis has run too long
            if (sw.ElapsedMilliseconds > max_duration_in_ms)
            {
                throw new TimeoutException("Timeout exception in NormalPerRange_Step.");
            }

            var candidate = AST.Address.AddressFromCOMObject(normal_dist.getErrorAtPosition(i), wb);

            // first outlier not previously marked good wins; returning here
            // prevents later ranges from overwriting the flagged cell
            if (!known_good.Contains(candidate))
            {
                return candidate;
            }
        }
    }
    // every outlier in every range has already been inspected
    return null;
}
// Saves the current value of every formula cell to a CellDict
// (formula address -> value as read from the COM object).
// Throws if a non-formula address is passed (DEBUG builds only) or if the
// same address appears twice in formula_nodes.
public static CellDict SaveOutputs(AST.Address[] formula_nodes, DAG dag)
{
    var cd = new CellDict();
    foreach (AST.Address formula_addr in formula_nodes)
    {
        // throw an exception in debug mode, because this should never happen
#if DEBUG
        if (!(bool)(dag.getCOMRefForAddress(formula_addr).Range.HasFormula))
        {
            String fstring = dag.getFormulaAtAddress(formula_addr);
            // include the offending cell's contents so the failure is diagnosable
            // (previously fstring was computed but never used in the message)
            throw new Exception(String.Format("Formula address is not a formula: {0}", fstring));
        }
#endif
        // each output address must appear at most once; a duplicate means
        // the caller handed us a bad node list
        if (cd.ContainsKey(formula_addr))
        {
            throw new Exception(String.Format("Failed in SaveOutputs: duplicate formula address {0}.", formula_addr));
        }
        // save value
        cd.Add(formula_addr, dag.readCOMValueAtAddress(formula_addr));
    }
    return cd;
}
// For running a simulation from the batch runner.
// Validates inputs, records error/output magnitude statistics for the
// pre-generated error set, then delegates to Run.
// Returns the number of cells inspected.
public int RunFromBatch(int nboots,                   // number of bootstraps
                        string xlfile,                // name of the workbook
                        double significance,          // significance threshold for test
                        Excel.Application app,        // reference to Excel app
                        CutoffKind ck,
                        Classification c,             // data from which to generate errors
                        Random r,                     // a random number generator
                        AnalysisType analysisType,    // the type of analysis to run
                        bool weighted,                // should we weigh things?
                        bool all_outputs,             // if !all_outputs, we only consider terminal outputs
                        DAG dag,                      // the computation tree of the spreadsheet
                        Excel.Workbook wb,            // the workbook being analyzed
                        CellDict errors,              // the errors that will be introduced in the spreadsheet
                        AST.Range[] terminal_input_vectors,   // the inputs
                        AST.Address[] terminal_formula_cells, // the outputs
                        CellDict original_inputs,     // original values of the inputs
                        CellDict correct_outputs,     // the correct outputs
                        long max_duration_in_ms,
                        String logfile                // filename for the output log
                       )
{
    // refuse to run on workbooks with nothing to analyze
    if (terminal_input_vectors.Length == 0)
    {
        throw new NoRangeInputs();
    }

    // NOTE(review): this guards original_inputs (input cells), yet throws
    // NoFormulas — the exception choice looks mismatched; confirm intent
    if (original_inputs.Count() == 0)
    {
        throw new NoFormulas();
    }

    _errors = errors;

    // find the error with the largest magnitude
    // this is mostly useful for the single-perturbation experiments;
    // split errors into numeric and string perturbations first
    var num_errs = _errors.Where(pair => Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));
    var str_errs = _errors.Where(pair => !Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));

    // largest numeric error magnitude (0 when there are no numeric errors)
    _num_max_err_diff_mag = num_errs.Count() != 0 ?
        num_errs.Select(
            (KeyValuePair<AST.Address, string> pair) =>
                Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(original_inputs[pair.Key]))
        ).Max() : 0;

    // largest string error magnitude (0 when there are no string errors)
    _str_max_err_diff_mag = str_errs.Count() != 0 ?
        str_errs.Select(
            (KeyValuePair<AST.Address, string> pair) =>
                Utility.StringMagnitudeChange(pair.Value, original_inputs[pair.Key])
        ).Max() : 0;

    // find the output with the largest magnitude
    var num_outs = correct_outputs.Where(pair => Utility.IsNumber(pair.Value));
    var str_outs = correct_outputs.Where(pair => !Utility.IsNumber(pair.Value));

    // NOTE(review): pair comes from correct_outputs, so pair.Value and
    // correct_outputs[pair.Key] are the same value here — this magnitude
    // change is always computed against itself; verify this is intended
    _num_max_output_diff_mag = num_outs.Count() != 0 ?
        num_outs.Select(
            (KeyValuePair<AST.Address, string> pair) =>
                Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(correct_outputs[pair.Key]))
        ).Max() : 0;

    // NOTE(review): same self-comparison pattern as above
    _str_max_output_diff_mag = str_outs.Count() != 0 ?
        str_outs.Select(
            (KeyValuePair<AST.Address, string> pair) =>
                Utility.StringMagnitudeChange(pair.Value, correct_outputs[pair.Key])
        ).Max() : 0;

    // hand off to the main simulation entry point (no progress bar)
    return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
}
// Randomly sprinkles known formulae across a mock workbook's worksheets,
// then checks that DAG.getAllFormulaAddrs() finds exactly the formulae that
// were added, on the worksheets they were added to.
// NOTE(review): uses an unseeded Random, so the exact formula placement
// varies between runs (each sheet is guaranteed at least one formula).
public void TestGetFormulaRanges()
{
    var mwb = new MockWorkbook();

    // rnd, for random formulae assignment
    Random rand = new Random();

    // gin up some formulae: (cell address, formula text) pairs
    Tuple<string, string>[] fs = {new Tuple<string,string>("B4", "=COUNT(A1:A5)"),
                                  new Tuple<string,string>("A6", "=SUM(B5:B40)"),
                                  new Tuple<string,string>("Z2", "=AVERAGE(A1:E1)"),
                                  new Tuple<string,string>("B44", "=MEDIAN(D4:D9)")};

    // to keep track of what we did: worksheet -> formulae placed on it
    var d = new System.Collections.Generic.Dictionary<Excel.Worksheet, System.Collections.Generic.List<Tuple<string, string>>>();

    // add the formulae to the worksheets, randomly
    foreach (Excel.Worksheet w in mwb.GetWorksheets())
    {
        // init list for each worksheet
        d[w] = new System.Collections.Generic.List<Tuple<string, string>>();

        // add each formula with probability 1/2
        foreach (var f in fs)
        {
            if (rand.Next(0, 2) == 0)
            {
                w.Range[f.Item1, f.Item1].Formula = f.Item2;
                // keep track of what we did
                d[w].Add(f);
            }
        }

        // we need at least one formula, so add one if the above procedure did not
        if (d[w].Count() == 0)
        {
            w.Range[fs[0].Item1, fs[0].Item1].Formula = fs[0].Item2;
            d[w].Add(fs[0]);
        }
    }

    // init DAG over the mock workbook
    var dag = new DAG(mwb.GetWorkbook(), mwb.GetApplication(), false);

    // get the formulas found by the DAG
    var formulas = dag.getAllFormulaAddrs();

    // the DAG should find exactly as many formulae as we placed,
    // summed over all worksheets
    var expected = d.Values.Select(v => v.Count).Aggregate((acc, c) => acc + c);
    if (formulas.Length != expected)
    {
        throw new Exception("DAG.getAllFormulaAddrs() should return " + expected + " elements but instead returns " + formulas.Length + ".");
    }

    bool all_ok = true;

    // group the discovered formulae by the worksheet they live on
    var f_wsgroups = formulas.GroupBy(f => f.GetCOMObject(mwb.GetApplication()).Worksheet);
    foreach (var pair in f_wsgroups)
    {
        // get formulas in this worksheet as reported by Excel itself
        var r = pair.Key.UsedRange.SpecialCells(Excel.XlCellType.xlCellTypeFormulas);

        // check that all formulae for this worksheet are accounted for:
        // every formula string we placed must appear in some cell of r
        bool r_ok = d[r.Worksheet].Aggregate(true, (bool acc, Tuple<string, string> f) =>
        {
            bool found = false;
            foreach (Excel.Range cell in r)
            {
                if (String.Equals((string)cell.Formula, f.Item2))
                {
                    found = true;
                }
            }
            return acc && found;
        });
        all_ok = all_ok && r_ok;
    }
    if (!all_ok)
    {
        throw new Exception("ConstructTree.GetFormulaRanges() failed to return all of the formulae that were added.");
    }
}
// Simulates a user repeatedly running the chosen analysis, inspecting each
// flagged cell, and correcting it when it is a true error — until the
// analysis stops flagging cells (or a count-based cutoff is reached).
// MODIFIES THE WORKBOOK: true-positive cells are restored to their original
// values as they are found. Every inspection is appended to _error_log.
// Returns per-session statistics (true/false positives/negatives, precision
// at k, running total error, per-function max errors).
private UserResults SimulateUser(int nboots,
                                 double significance,
                                 CutoffKind ck,
                                 DAG dag,
                                 CellDict original_inputs,   // error-free input values
                                 CellDict errord,            // injected errors: address -> bad value
                                 CellDict correct_outputs,   // outputs of the error-free spreadsheet
                                 Excel.Workbook wb,
                                 Excel.Application app,
                                 AnalysisType analysis_type,
                                 bool weighted,
                                 bool all_outputs,
                                 long max_duration_in_ms,
                                 Stopwatch sw,               // already-running session stopwatch
                                 String logfile,
                                 ProgBar pb
                                )
{
    // init user results data structure
    var o = new UserResults();
    HashSet<AST.Address> known_good = new HashSet<AST.Address>();

    // initialize procedure
    var errors_remain = true;
    var max_errors = new ErrorDict();
    var incorrect_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
    var errors_found = 0;
    var number_of_true_errors = errord.Count;
    Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

    // the corrected state of the spreadsheet
    // NOTE(review): initialized to a copy of correct_outputs, not of the
    // current (incorrect) outputs; it is refreshed from the live sheet only
    // after the first correction — confirm this is the intended baseline
    CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

    // remove errors loop
    var cells_inspected = 0;
    List<KeyValuePair<AST.Address, int>> filtered_high_scores = null;
    bool correction_made = true;
    while (errors_remain)
    {
        // progress dot per iteration
        Console.Write(".");

        AST.Address flagged_cell = null;

        // choose the appropriate test: CheckCell bootstrap analysis, or one
        // of the two normal-distribution baselines
        if (analysis_type == AnalysisType.CheckCell5 ||
            analysis_type == AnalysisType.CheckCell10
           )
        {
            flagged_cell = SimulationStep.CheckCell_Step(o,
                                                         significance,
                                                         ck,
                                                         nboots,
                                                         dag,
                                                         app,
                                                         weighted,
                                                         all_outputs,
                                                         correction_made,
                                                         known_good,
                                                         ref filtered_high_scores,
                                                         max_duration_in_ms,
                                                         sw,
                                                         pb);
        }
        else if (analysis_type == AnalysisType.NormalPerRange)
        {
            flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
        }
        else if (analysis_type == AnalysisType.NormalAllInputs)
        {
            flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
        }

        // stop if the test no longer returns anything or if
        // the test is simply done inspecting based on a fixed threshold
        if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
        {
            errors_remain = false;
        }
        else // a cell was flagged
        {
            // cells_inspected should only be incremented when a cell is actually flagged. If nothing is flagged,
            // then nothing is inspected, so cells_inspected doesn't increase.
            cells_inspected += 1;

            // check to see if the flagged value is actually an error
            if (errord.ContainsKey(flagged_cell))
            {
                correction_made = true;
                errors_found += 1;
                // P(k) * rel(k): precision at this inspection step, rel(k)=1
                o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                o.true_positives.Add(flagged_cell);

                // correct flagged cell by restoring its original value
                flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                // compute total error after applying this correction
                var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                o.current_total_error.Add(current_total_error);

                // re-read outputs now that the sheet has recalculated
                partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            }
            else
            {
                correction_made = false;
                // numerator is 0 here because rel(k) = 0 when no error was found
                o.PrecRel_at_k.Add(0.0);
                o.false_positives.Add(flagged_cell);
            }

            // mark it as known good -- at this point the cell has been
            // 'inspected' regardless of whether it was an error
            // It was either corrected or marked as OK
            known_good.Add(flagged_cell);

            // compute output error magnitudes
            var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // compute input error magnitude: numeric and string changes are
            // measured separately; a cell contributes to exactly one of them
            double num_input_error_magnitude;
            double str_input_error_magnitude;
            if (errord.ContainsKey(flagged_cell))
            {
                if (Utility.BothNumbers(errord[flagged_cell], original_inputs[flagged_cell]))
                {
                    num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[flagged_cell]), Double.Parse(original_inputs[flagged_cell]));
                    str_input_error_magnitude = 0;
                }
                else
                {
                    num_input_error_magnitude = 0;
                    str_input_error_magnitude = Utility.StringMagnitudeChange(errord[flagged_cell], original_inputs[flagged_cell]);
                }
            }
            else
            {
                // false positive: the flagged cell held its original value
                num_input_error_magnitude = 0;
                str_input_error_magnitude = 0;
            }

            // write error log entry for this inspection (flagged=true)
            var logentry = new LogEntry(analysis_type,
                                        wb.Name,
                                        flagged_cell,
                                        original_inputs[flagged_cell],
                                        errord.ContainsKey(flagged_cell) ? errord[flagged_cell] : original_inputs[flagged_cell],
                                        output_error_magnitude,
                                        num_input_error_magnitude,
                                        str_input_error_magnitude,
                                        true,
                                        correction_made,
                                        significance,
                                        ck.Threshold);
            logentry.WriteLog(logfile);
            _error_log.Add(logentry);
        }
    }

    // find all of the false negatives: injected errors never flagged
    o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
    o.max_errors = max_errors;

    var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

    // write out all false negative information (flagged=false entries)
    foreach (AST.Address fn in o.false_negatives)
    {
        double num_input_error_magnitude;
        double str_input_error_magnitude;

        if (Utility.BothNumbers(errord[fn], original_inputs[fn]))
        {
            num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[fn]), Double.Parse(original_inputs[fn]));
            str_input_error_magnitude = 0;
        }
        else
        {
            num_input_error_magnitude = 0;
            str_input_error_magnitude = Utility.StringMagnitudeChange(errord[fn], original_inputs[fn]);
        }

        // write error log
        _error_log.Add(new LogEntry(analysis_type,
                                    wb.Name,
                                    fn,
                                    original_inputs[fn],
                                    errord[fn],
                                    last_out_err_mag,
                                    num_input_error_magnitude,
                                    str_input_error_magnitude,
                                    false,
                                    true,
                                    significance,
                                    ck.Threshold));
    }
    return o;
}