Exemple #1
0
        /// <summary>
        /// Returns the function outputs associated with <paramref name="sample"/>, computing
        /// and caching them only when the sample has not been seen before.
        /// </summary>
        /// <param name="com">Excel range that receives the sample's values on a cache miss.</param>
        /// <param name="dag">Graph used to read the value at each output address.</param>
        /// <param name="original">Sample written back to <paramref name="com"/> when <paramref name="replace_original"/> is true.</param>
        /// <param name="sample">Sample to evaluate; it is also the cache key.</param>
        /// <param name="outputs">Addresses whose values are read after the replacement.</param>
        /// <param name="replace_original">Whether to restore <paramref name="original"/> after a cache miss.</param>
        /// <returns>One entry per element of <paramref name="outputs"/>, in the same order.</returns>
        public FunctionOutput <string>[] FastReplace(Excel.Range com, DAG dag, InputSample original, InputSample sample, AST.Address[] outputs, bool replace_original)
        {
            // cache hit: the spreadsheet is not touched at all
            FunctionOutput <string>[] cached;
            if (_d.TryGetValue(sample, out cached))
            {
                return cached;
            }

            // cache miss: write the sample into the spreadsheet
            ReplaceExcelRange(com, sample);

            // read every requested output
            var results = new FunctionOutput <string> [outputs.Length];
            for (var idx = 0; idx < results.Length; idx++)
            {
                var value = dag.readCOMValueAtAddress(outputs[idx]);
                results[idx] = new FunctionOutput <string>(value, sample.GetExcludes());
            }

            // remember the outputs for this sample
            _d.Add(sample, results);

            // put the original values back only when the caller asked for it
            if (replace_original)
            {
                ReplaceExcelRange(com, original);
            }

            return results;
        }
Exemple #2
0
        /// <summary>
        /// Computes the weight of <paramref name="node"/>, stores it on the graph via
        /// <c>dag.setWeight</c>, and returns it.
        /// </summary>
        /// <remarks>
        /// A node that is not a formula weighs 1. A formula weighs the sum of the weights of
        /// all of its inputs (every address of every input vector, followed by every
        /// single-cell input), each obtained by a recursive call.
        /// </remarks>
        /// <param name="node">Address whose weight is computed.</param>
        /// <param name="dag">Graph that answers the formula/input queries and stores weights.</param>
        /// <returns>The weight that was stored for <paramref name="node"/>.</returns>
        private static int PropagateNodeWeight(AST.Address node, DAG dag)
        {
            // base case: a plain input
            if (!dag.isFormula(node))
            {
                dag.setWeight(node, 1);
                return 1;
            }

            // collect every cell that feeds this formula
            var vectorInputs = dag.getFormulaInputVectors(node);
            var cellInputs = dag.getFormulaSingleCellInputs(node);
            var feeders = vectorInputs.SelectMany(v => v.Addresses()).ToList();
            feeders.AddRange(cellInputs);

            // recurse into each feeder and accumulate
            var total = 0;
            for (var n = 0; n < feeders.Count; n++)
            {
                total += PropagateNodeWeight(feeders[n], dag);
            }

            dag.setWeight(node, total);
            return total;
        }
Exemple #3
0
        /// <summary>
        /// Looks up the outputs produced by <paramref name="sample"/> in the memo table; on a
        /// miss, writes the sample into the spreadsheet, reads all outputs, and memoizes them.
        /// </summary>
        /// <param name="com">Excel range overwritten with the sample on a miss.</param>
        /// <param name="dag">Graph used to read each output value.</param>
        /// <param name="original">Values restored into <paramref name="com"/> when <paramref name="replace_original"/> is true.</param>
        /// <param name="sample">Sample being evaluated and the key of the memo table.</param>
        /// <param name="outputs">Output addresses to read, in the order they are returned.</param>
        /// <param name="replace_original">True to write <paramref name="original"/> back after a miss.</param>
        /// <returns>The memoized or freshly computed outputs, one per output address.</returns>
        public FunctionOutput<string>[] FastReplace(Excel.Range com, DAG dag, InputSample original, InputSample sample, AST.Address[] outputs, bool replace_original)
        {
            FunctionOutput<string>[] memoized;
            if (_d.TryGetValue(sample, out memoized))
            {
                // already computed for this sample; no spreadsheet access needed
                return memoized;
            }

            // overwrite the range with the sample's values
            ReplaceExcelRange(com, sample);

            // collect the value of every output address
            var computed = new FunctionOutput<string>[outputs.Length];
            var position = 0;
            foreach (var address in outputs)
            {
                computed[position] = new FunctionOutput<string>(dag.readCOMValueAtAddress(address), sample.GetExcludes());
                position++;
            }

            // store for later lookups
            _d.Add(sample, computed);

            // optionally undo the overwrite
            if (replace_original)
            {
                ReplaceExcelRange(com, original);
            }

            return computed;
        }
Exemple #4
0
        /// <summary>
        /// Scores every cell of <paramref name="rangeNode"/> using the numeric form of the
        /// bootstrap outputs.
        /// </summary>
        /// <remarks>
        /// For each cell index, <c>RejectNullHypothesis</c> is called on the sorted numeric
        /// bootstraps. A non-zero result is multiplied by the weight, truncated to an integer
        /// and added to that cell's score; a zero result only guarantees that the cell has an
        /// entry (with score 0 when it is new).
        /// </remarks>
        /// <param name="dag">Graph queried for the weight of <paramref name="functionNode"/>.</param>
        /// <param name="rangeNode">Input range whose cells are scored.</param>
        /// <param name="functionNode">Function whose weight is used when <paramref name="weighted"/> is true.</param>
        /// <param name="boots">Bootstrap outputs, converted to numeric form before testing.</param>
        /// <param name="initial_output">Original output passed to the hypothesis test.</param>
        /// <param name="weighted">When false, every cell uses a weight of 1.</param>
        /// <param name="significance">Significance level forwarded to the hypothesis test.</param>
        /// <returns>A score table containing one entry per distinct cell of the range.</returns>
        public static TreeScore NumericHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput <string>[] boots, string initial_output, bool weighted, double significance)
        {
            // cells of the input range and how many there are
            var cells = rangeNode.Addresses();
            var cellCount = cells.Count();

            // result table
            var scores = new TreeScore();

            // numeric, sorted view of the bootstraps
            var numericBoots = ConvertToNumericOutput(boots);
            var sortedBoots = SortBootstraps(numericBoots);

            for (int idx = 0; idx < cellCount; idx++)
            {
                AST.Address cell = cells[idx];

                // weight of the function of interest, or 1 when unweighted
                int weight = weighted ? dag.getWeight(functionNode) : 1;

                double outlieriness = RejectNullHypothesis(sortedBoots, initial_output, idx, significance);
                bool flagged = outlieriness != 0.0;
                int contribution = flagged ? (int)(weight * outlieriness) : 0;

                if (!scores.ContainsKey(cell))
                {
                    // first time this cell is seen: it always gets an entry
                    scores.Add(cell, contribution);
                }
                else if (flagged)
                {
                    // existing entry: only a failed test changes the score
                    scores[cell] += contribution;
                }
            }

            return scores;
        }
Exemple #5
0
        /// <summary>
        /// Computes and stores a weight for every formula returned by
        /// <c>dag.terminalFormulaNodes(false)</c>; the recursive helper also stores weights
        /// for the nodes it visits on the way.
        /// </summary>
        /// <param name="dag">Graph whose node weights are updated.</param>
        /// <exception cref="ContainsLoopException">The graph reports that it contains a loop.</exception>
        private static void PropagateWeights(DAG dag)
        {
            // refuse graphs that report a loop
            if (dag.containsLoop())
            {
                throw new ContainsLoopException();
            }

            // starting formulas (the original author calls these the roots of the forest)
            var roots = dag.terminalFormulaNodes(false);

            foreach (AST.Address root in roots)
            {
                int rootWeight = PropagateNodeWeight(root, dag);
                dag.setWeight(root, rootWeight);
            }
        }
Exemple #6
0
        /// <summary>
        /// Scores every cell of <paramref name="rangeNode"/> using the string form of the
        /// bootstrap outputs.
        /// </summary>
        /// <remarks>
        /// For each cell index, <c>RejectNullHypothesis</c> is called on the bootstraps. When it
        /// returns true the weight is added to that cell's score; otherwise the cell is only
        /// guaranteed to have an entry (with score 0 when it is new).
        /// </remarks>
        /// <param name="dag">Graph queried for the weight of <paramref name="functionNode"/>.</param>
        /// <param name="rangeNode">Input range whose cells are scored.</param>
        /// <param name="functionNode">Function whose weight is used when <paramref name="weighted"/> is true.</param>
        /// <param name="boots">Bootstrap outputs passed to the hypothesis test.</param>
        /// <param name="initial_output">Original output passed to the hypothesis test.</param>
        /// <param name="weighted">When false, every cell uses a weight of 1.</param>
        /// <param name="significance">Significance level forwarded to the hypothesis test.</param>
        /// <returns>A score table containing one entry per distinct cell of the range.</returns>
        public static TreeScore StringHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput <string>[] boots, string initial_output, bool weighted, double significance)
        {
            // cells of the input range
            var cells = rangeNode.Addresses();

            // result table
            var scores = new TreeScore();

            var cellCount = cells.Count();

            for (int idx = 0; idx < cellCount; idx++)
            {
                AST.Address cell = cells[idx];

                // weight of the function of interest, or 1 when unweighted
                int weight = weighted ? dag.getWeight(functionNode) : 1;

                bool rejected = RejectNullHypothesis(boots, initial_output, idx, significance);
                int contribution = rejected ? weight : 0;

                if (!scores.ContainsKey(cell))
                {
                    // first time this cell is seen: it always gets an entry
                    scores.Add(cell, contribution);
                }
                else if (rejected)
                {
                    // existing entry: only a failed test changes the score
                    scores[cell] += contribution;
                }
            }

            return scores;
        }
Exemple #7
0
        /// <summary>
        /// Builds the dependency graph for a workbook and captures the state a simulation
        /// needs: the graph, the terminal inputs and formulas, and the original input and
        /// output values.
        /// </summary>
        /// <param name="app">Excel application used to build the graph and inject values.</param>
        /// <param name="wbh">Workbook to analyze.</param>
        /// <param name="pb">Progress bar; incremented once after the graph is built.</param>
        /// <param name="ignore_parse_errors">Forwarded to the <c>DAG</c> constructor.</param>
        /// <returns>A <c>PrepData</c> populated with the graph and the saved state.</returns>
        /// <exception cref="DataDebugMethods.ContainsLoopException">The graph reports a loop.</exception>
        /// <exception cref="NoRangeInputs">There are no terminal input vectors.</exception>
        /// <exception cref="NoFormulas">There are no terminal formula nodes.</exception>
        public static PrepData PrepSimulation(Excel.Application app, Excel.Workbook wbh, ProgBar pb, bool ignore_parse_errors)
        {
            // build the graph and reject graphs that report a loop
            var graph = new DAG(wbh, app, ignore_parse_errors);
            if (graph.containsLoop())
            {
                throw new DataDebugMethods.ContainsLoopException();
            }
            pb.IncrementProgress();

            // query the terminal nodes once; "true" asks for all outputs
            var inputVectors = graph.terminalInputVectors();
            var formulaNodes = graph.terminalFormulaNodes(true);

            if (inputVectors.Length == 0)
            {
                throw new NoRangeInputs();
            }

            if (formulaNodes.Length == 0)
            {
                throw new NoFormulas();
            }

            // snapshot the inputs as they currently are
            CellDict savedInputs = UserSimulation.Utility.SaveInputs(graph);

            // Re-inject the same values before reading outputs. Per the original author's
            // note this forces a recalculation, so that later comparisons are not thrown off
            // by Excel floating-point oddities.
            UserSimulation.Utility.InjectValues(app, wbh, savedInputs);

            // snapshot the outputs of the terminal formulas
            CellDict savedOutputs = UserSimulation.Utility.SaveOutputs(formulaNodes, graph);

            var prepared = new PrepData();
            prepared.dag = graph;
            prepared.original_inputs = savedInputs;
            prepared.correct_outputs = savedOutputs;
            prepared.terminal_input_nodes = inputVectors;
            prepared.terminal_formula_nodes = formulaNodes;
            return prepared;
        }
        /// <summary>
        /// Returns the first address of the filtered score list, or null when the list is
        /// empty, updating <paramref name="filtered_high_scores"/> on the way.
        /// </summary>
        /// <remarks>
        /// When <paramref name="run_bootstrap"/> is true the analysis is re-run and the cutoff
        /// is applied to the fresh scores. Otherwise (a cell was marked as OK rather than
        /// corrected) the existing list is kept and the known-good cells are filtered out.
        /// </remarks>
        /// <param name="o">Not used by this method.</param>
        /// <param name="significance">Significance level forwarded to the analysis.</param>
        /// <param name="ck">Cutoff applied to freshly computed scores.</param>
        /// <param name="nboots">Number of bootstraps forwarded to the analysis.</param>
        /// <param name="dag">Graph forwarded to the analysis.</param>
        /// <param name="app">Excel application forwarded to the analysis.</param>
        /// <param name="weighted">Forwarded to the analysis.</param>
        /// <param name="all_outputs">Forwarded to the analysis.</param>
        /// <param name="run_bootstrap">Whether to recompute the scores.</param>
        /// <param name="known_good">Cells excluded from the returned list.</param>
        /// <param name="filtered_high_scores">Score list; replaced by this call.</param>
        /// <param name="max_duration_in_ms">Forwarded to the analysis.</param>
        /// <param name="sw">Stopwatch forwarded to the analysis.</param>
        /// <param name="pb">Progress bar forwarded to the analysis.</param>
        /// <returns>The key of the first remaining entry, or null when nothing remains.</returns>
        public static AST.Address CheckCell_Step(UserResults o,
            double significance,
            CutoffKind ck,
            int nboots,
            DAG dag,
            Excel.Application app,
            bool weighted,
            bool all_outputs,
            bool run_bootstrap,
            HashSet<AST.Address> known_good,
            ref List<KeyValuePair<AST.Address, int>> filtered_high_scores,
            long max_duration_in_ms,
            Stopwatch sw,
            ProgBar pb)
        {
            if (!run_bootstrap)
            {
                // nothing was corrected: keep the current ranking, minus cells marked as OK
                filtered_high_scores = filtered_high_scores.Where(entry => !known_good.Contains(entry.Key)).ToList();
            }
            else
            {
                // a correction was made: recompute the scores from scratch
                TreeScore freshScores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

                // keep only what passes the cutoff
                filtered_high_scores = ck.applyCutoff(freshScores, known_good);
            }

            // nothing left to flag
            if (filtered_high_scores.Count() == 0)
            {
                return null;
            }

            // first remaining entry
            return filtered_high_scores[0].Key;
        }
Exemple #9
0
            // Exclusion scores for this job (per the original note: one entry per input cell).
            private TreeScore _score;

            /// <summary>
            /// Stores the arguments of a hypothesis-test job in the instance fields and starts
            /// with an empty score table.
            /// </summary>
            /// <param name="dag">Graph stored in <c>_dag</c>.</param>
            /// <param name="bs">Bootstrap outputs stored in <c>_bs</c>.</param>
            /// <param name="initial_outputs">Original output values stored in <c>_initial_outputs</c>.</param>
            /// <param name="input">Input range stored in <c>_input</c>.</param>
            /// <param name="output_arr">Output addresses stored in <c>_outputs</c>.</param>
            /// <param name="weighted">Weighting flag stored in <c>_weighted</c>.</param>
            /// <param name="significance">Significance level stored in <c>_significance</c>.</param>
            /// <param name="mre">Event stored in <c>_mre</c>.</param>
            public DataDebugJob(
                DAG dag,
                FunctionOutput <String>[][] bs,
                Dictionary <AST.Address, string> initial_outputs,
                AST.Range input,
                AST.Address[] output_arr,
                bool weighted,
                double significance,
                ManualResetEvent mre)
            {
                // empty result table
                _score = new TreeScore();

                // data the job works on
                _dag = dag;
                _bs = bs;
                _initial_outputs = initial_outputs;
                _input = input;
                _outputs = output_arr;

                // test configuration
                _weighted = weighted;
                _significance = significance;

                // event handed in by the caller
                _mre = mre;
            }
        /// <summary>
        /// Builds a normal distribution over all terminal input vectors and returns the
        /// first reported error, in position order, whose address is not known to be good.
        /// </summary>
        /// <param name="dag">Graph providing the terminal input vectors.</param>
        /// <param name="app">Excel application passed to the distribution.</param>
        /// <param name="wb">Workbook used to convert a COM object into an address.</param>
        /// <param name="known_good">Addresses that must not be returned.</param>
        /// <param name="max_duration_in_ms">Time budget checked before each candidate.</param>
        /// <param name="sw">Stopwatch measuring the elapsed time.</param>
        /// <returns>The flagged address, or null when every reported error is known good or there are none.</returns>
        /// <exception cref="TimeoutException">The stopwatch exceeds the time budget.</exception>
        public static AST.Address NormalAllOutputs_Step(DAG dag,
            Excel.Application app,
            Excel.Workbook wb,
            HashSet<AST.Address> known_good,
            long max_duration_in_ms,
            Stopwatch sw)
        {
            // distribution over the entire set of inputs
            var distribution = new DataDebugMethods.NormalDistribution(dag.terminalInputVectors(), app);

            // nothing was reported
            if (!(distribution.getErrorsCount() > 0))
            {
                return null;
            }

            for (int position = 0; position < distribution.getErrorsCount(); position++)
            {
                // give up once the time budget is spent
                if (sw.ElapsedMilliseconds > max_duration_in_ms)
                {
                    throw new TimeoutException("Timeout exception in NormalAllOutputs_Step.");
                }

                var candidateCom = distribution.getErrorAtPosition(position);
                AST.Address candidate = AST.Address.AddressFromCOMObject(candidateCom, wb);

                // the first candidate that is not known good wins
                if (!known_good.Contains(candidate))
                {
                    return candidate;
                }
            }

            // every candidate was known good
            return null;
        }
Exemple #11
0
        /// <summary>
        /// Builds a score table for the cells of <paramref name="rangeNode"/> from string
        /// bootstrap outputs.
        /// </summary>
        /// <remarks>
        /// Each cell index is tested with <c>RejectNullHypothesis</c>. A true result adds the
        /// weight to the cell's score; a false result leaves an existing score alone and
        /// creates a zero entry for a cell not seen before.
        /// </remarks>
        /// <param name="dag">Graph queried for the weight of <paramref name="functionNode"/>.</param>
        /// <param name="rangeNode">Input range whose cells are scored.</param>
        /// <param name="functionNode">Function whose weight applies when <paramref name="weighted"/> is true.</param>
        /// <param name="boots">Bootstrap outputs passed to the hypothesis test.</param>
        /// <param name="initial_output">Original output passed to the hypothesis test.</param>
        /// <param name="weighted">When false, the weight is 1.</param>
        /// <param name="significance">Significance level forwarded to the hypothesis test.</param>
        /// <returns>The score table, with one entry per distinct cell.</returns>
        public static TreeScore StringHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput<string>[] boots, string initial_output, bool weighted, double significance)
        {
            var inputCells = rangeNode.Addresses();
            var table = new TreeScore();
            var total = inputCells.Count();

            var index = 0;
            while (index < total)
            {
                AST.Address current = inputCells[index];

                // 1 by default; the function's weight when weighting is on
                int weight = 1;
                if (weighted)
                {
                    weight = dag.getWeight(functionNode);
                }

                bool testFailed = RejectNullHypothesis(boots, initial_output, index, significance);
                bool known = table.ContainsKey(current);

                if (testFailed && known)
                {
                    table[current] += weight;
                }
                else if (testFailed)
                {
                    table.Add(current, weight);
                }
                else if (!known)
                {
                    // make sure the cell appears in the table even with no evidence against it
                    table.Add(current, 0);
                }

                index++;
            }

            return table;
        }
Exemple #12
0
        /// <summary>
        /// Reads the current value of every output address as a string, using one read of
        /// the worksheet's used range per worksheet.
        /// </summary>
        /// <remarks>
        /// Outputs are grouped by worksheet name. When the used range spans more than one
        /// cell its values are read as a two-dimensional array and each output is looked up
        /// by its offset from the range's top-left corner. When the used range is a single
        /// cell, that cell's address is constructed and its value stored. Null or
        /// whitespace-only values are stored as the empty string.
        /// </remarks>
        /// <param name="outputs">Output addresses to read.</param>
        /// <param name="dag">Graph used to resolve addresses to COM references.</param>
        /// <returns>A dictionary from address to its string value.</returns>
        public static Dictionary<AST.Address, string> StoreOutputs(AST.Address[] outputs, DAG dag)
        {
            var stored = new Dictionary<AST.Address, string>();

            // one group of addresses per worksheet
            var byWorksheet = outputs.GroupBy(addr => dag.getCOMRefForAddress(addr).WorksheetName);

            foreach (IEnumerable<AST.Address> sheetOutputs in byWorksheet)
            {
                // used range of the worksheet holding this group
                var firstRef = dag.getCOMRefForAddress(sheetOutputs.First());
                var used = firstRef.Worksheet.UsedRange;

                // bounds of the used range
                var left = used.Column;
                var right = used.Columns.Count + left - 1;
                var top = used.Row;
                var bottom = used.Rows.Count + top - 1;

                // names needed to build an address
                var sheetName = new FSharpOption<string>(firstRef.WorksheetName);
                var bookName = new FSharpOption<string>(firstRef.WorkbookName);
                var path = firstRef.Path;

                if (left == right && top == bottom)
                {
                    // the used range is a single cell, so Value2 is a scalar, not an array
                    AST.Address only = AST.Address.NewFromR1C1(top, left, sheetName, bookName, path);

                    String text = System.Convert.ToString(used.Value2);
                    stored.Add(only, String.IsNullOrWhiteSpace(text) ? "" : text);
                }
                else
                {
                    // fast array read; the first index is the row (y), the second the column (x)
                    object[,] values = used.Value2;

                    var colOffset = left - 1;
                    var rowOffset = top - 1;

                    foreach (AST.Address addr in sheetOutputs)
                    {
                        // position of the address inside the array
                        var col = addr.X - colOffset;
                        var row = addr.Y - rowOffset;

                        String text = System.Convert.ToString(values[row, col]);
                        stored.Add(addr, String.IsNullOrWhiteSpace(text) ? "" : text);
                    }
                }
            }

            return stored;
        }
Exemple #13
0
        /// <summary>
        /// Builds a score table for the cells of <paramref name="rangeNode"/> from the
        /// numeric, sorted form of the bootstrap outputs.
        /// </summary>
        /// <remarks>
        /// Each cell index is tested with <c>RejectNullHypothesis</c>. A non-zero result,
        /// multiplied by the weight and truncated to an integer, is added to the cell's
        /// score; a zero result leaves an existing score alone and creates a zero entry for
        /// a cell not seen before.
        /// </remarks>
        /// <param name="dag">Graph queried for the weight of <paramref name="functionNode"/>.</param>
        /// <param name="rangeNode">Input range whose cells are scored.</param>
        /// <param name="functionNode">Function whose weight applies when <paramref name="weighted"/> is true.</param>
        /// <param name="boots">Bootstrap outputs, converted to numeric form before testing.</param>
        /// <param name="initial_output">Original output passed to the hypothesis test.</param>
        /// <param name="weighted">When false, the weight is 1.</param>
        /// <param name="significance">Significance level forwarded to the hypothesis test.</param>
        /// <returns>The score table, with one entry per distinct cell.</returns>
        public static TreeScore NumericHypothesisTest(DAG dag, AST.Range rangeNode, AST.Address functionNode, FunctionOutput<string>[] boots, string initial_output, bool weighted, double significance)
        {
            var inputCells = rangeNode.Addresses();
            var total = inputCells.Count();
            var table = new TreeScore();

            // numeric conversion followed by sorting
            var sorted = SortBootstraps(ConvertToNumericOutput(boots));

            var index = 0;
            while (index < total)
            {
                AST.Address current = inputCells[index];

                // 1 by default; the function's weight when weighting is on
                int weight = 1;
                if (weighted)
                {
                    weight = dag.getWeight(functionNode);
                }

                double outlieriness = RejectNullHypothesis(sorted, initial_output, index, significance);
                bool testFailed = outlieriness != 0.0;
                bool known = table.ContainsKey(current);

                if (testFailed && known)
                {
                    table[current] += (int)(weight * outlieriness);
                }
                else if (testFailed)
                {
                    table.Add(current, (int)(weight * outlieriness));
                }
                else if (!known)
                {
                    // make sure the cell appears in the table even with no evidence against it
                    table.Add(current, 0);
                }

                index++;
            }

            return table;
        }
Exemple #14
0
        /// <summary>
        /// Runs the bootstrap for every input range on the calling thread, hands one
        /// hypothesis-test job per input range to the thread pool, and merges the jobs'
        /// scores.
        /// </summary>
        /// <remarks>
        /// If an <see cref="System.OutOfMemoryException"/> is raised while preparing input
        /// <c>i</c>, the method waits for a running job to finish (or forces a garbage
        /// collection when none is running) and then retries the same input. A second
        /// failure after the forced collection is rethrown.
        /// </remarks>
        /// <param name="num_bootstraps">Number of resamples evaluated per input range.</param>
        /// <param name="resamples">Resamples indexed by input range, then by bootstrap.</param>
        /// <param name="initial_inputs">Original values of each input range; restored after bootstrapping.</param>
        /// <param name="initial_outputs">Original output values; its count sizes the bootstrap storage.</param>
        /// <param name="input_arr">Input ranges to process.</param>
        /// <param name="output_arr">Output addresses read for every resample.</param>
        /// <param name="dag">Graph used to resolve ranges and read outputs.</param>
        /// <param name="weighted">Forwarded to each job.</param>
        /// <param name="significance">Forwarded to each job.</param>
        /// <param name="pb">Progress bar; incremented once per processed input range.</param>
        /// <returns>The merged scores of all jobs.</returns>
        /// <exception cref="System.OutOfMemoryException">Memory is still insufficient after a forced collection.</exception>
        public static TreeScore Inference(
            int num_bootstraps,
            InputSample[][] resamples,
            Dictionary<AST.Range, InputSample> initial_inputs,
            Dictionary<AST.Address, string> initial_outputs,
            AST.Range[] input_arr,
            AST.Address[] output_arr,
            DAG dag,
            bool weighted,
            double significance,
            ProgBar pb)
        {
            // WaitHandle.WaitAny rejects arrays with more than 64 handles
            const int MaxWaitHandles = 64;

            // init thread event notification array
            var mres = new ManualResetEvent[input_arr.Length];

            // init job storage
            var ddjs = new DataDebugJob[input_arr.Length];

            // last-ditch effort flag: set after a forced GC; the next failure is fatal
            bool last_try = false;

            // init score storage
            var scores = new TreeScore();

            for (int i = 0; i < input_arr.Length; i++)
            {
                // whether the job for input i reached the thread pool during this attempt
                bool queued = false;

                try
                {
                    #region BOOTSTRAP
                    // bootstrapping is done in the parent STA thread because
                    // the .NET threading model prohibits thread pools (which
                    // are MTA) from accessing STA COM objects directly.

                    // alloc bootstrap storage for each output (f), for each resample (b)
                    FunctionOutput<string>[][] bs = new FunctionOutput<string>[initial_outputs.Count][];
                    for (int f = 0; f < initial_outputs.Count; f++)
                    {
                        bs[f] = new FunctionOutput<string>[num_bootstraps];
                    }

                    // init memoization table for input vector i
                    var memo = new BootMemo();

                    // fetch the input range TreeNode
                    var input = input_arr[i];

                    // fetch the input range COM object
                    var com = dag.getCOMRefForRange(input).Range;

                    // compute outputs
                    // replace the values of the COM object with the jth bootstrap,
                    // save all function outputs, and
                    // restore the original input
                    for (var b = 0; b < num_bootstraps; b++)
                    {
                        // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                        FunctionOutput<string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                        for (var f = 0; f < output_arr.Length; f++)
                        {
                            bs[f][b] = fos[f];
                        }
                    }

                    // restore the original inputs; faster to do once, after bootstrapping is done
                    BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

                    // TODO: restore formulas if it turns out that they were overwritten
                    //       this should never be the case
                    #endregion BOOTSTRAP

                    #region HYPOTHESIS_TEST
                    // completion event for this job
                    var mre = new ManualResetEvent(false);

                    // set up job
                    var job = new DataDebugJob(
                                dag,
                                bs,
                                initial_outputs,
                                input_arr[i],
                                output_arr,
                                weighted,
                                significance,
                                mre
                                );

                    // publish the event and the job, then hand the job to the thread pool
                    mres[i] = mre;
                    ddjs[i] = job;
                    ThreadPool.QueueUserWorkItem(job.threadPoolCallback, i);
                    queued = true;
                    #endregion HYPOTHESIS_TEST

                    // update progress bar
                    pb.IncrementProgress();
                }
                catch (System.OutOfMemoryException)
                {
                    if (last_try)
                    {
                        // we just don't have enough memory; "throw;" keeps the original stack trace
                        throw;
                    }

                    // Only slots of jobs that were actually queued are inspected: every slot
                    // below "started" is non-null, later slots may still be null.
                    int started = queued ? i + 1 : i;
                    var pending = mres.Take(started).Where(h => !h.WaitOne(0)).ToArray();

                    if (pending.Length == 0)
                    {
                        // If there are no more jobs running, but
                        // we still can't allocate memory, try invoking
                        // GC and then trying again
                        GC.Collect();
                        last_try = true;
                    }
                    else
                    {
                        // Wait for one of the still-running jobs to finish. Already-signaled
                        // events are excluded so that this call really blocks.
                        WaitHandle.WaitAny(pending.Take(MaxWaitHandles).ToArray());
                    }

                    // Retry this input unless its job is already in the thread pool; without
                    // the retry its slots would stay null and the merge loop below would fail.
                    if (!queued)
                    {
                        i--;
                    }
                }
            }

            // Do not proceed until all hypothesis tests are done.
            // WaitHandle.WaitAll cannot be called on an STA thread which
            // is why we call WaitOne in a loop.
            // Merge scores as data becomes available.
            for (int i = 0; i < input_arr.Length; i++)
            {
                mres[i].WaitOne();
                scores = DictAdd(scores, ddjs[i].Result);
            }

            return scores;
        }
Exemple #15
0
        /// <summary>
        /// For every input cell, injects <paramref name="k"/> generated error strings one at
        /// a time and records the one that produced the largest total output error.
        /// </summary>
        /// <remarks>
        /// After each injection the outputs are saved, the original value is injected back,
        /// and the total error against <paramref name="correct_outputs"/> is computed. A cell
        /// for which no error string produced an error above 0 is recorded with an empty
        /// string and an error of 0.
        /// </remarks>
        /// <param name="terminal_formula_nodes">Formulas whose outputs are saved after each injection.</param>
        /// <param name="inputs">Input cells and their original values.</param>
        /// <param name="k">Number of error strings tried per cell.</param>
        /// <param name="correct_outputs">Reference outputs used to compute the error.</param>
        /// <param name="app">Excel application used for injection.</param>
        /// <param name="wb">Workbook used for injection.</param>
        /// <param name="classification_file">File from which the classification is deserialized.</param>
        /// <param name="dag">Graph used when saving outputs.</param>
        /// <returns>For each input address, the worst error string and the error it produced.</returns>
        public Dictionary<AST.Address, Tuple<string, double>> TopOfKErrors(AST.Address[] terminal_formula_nodes, CellDict inputs, int k, CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, string classification_file, DAG dag)
        {
            var generator = new ErrorGenerator();
            var classification = Classification.Deserialize(classification_file);
            var worstByCell = new Dictionary<AST.Address, Tuple<string, double>>();

            foreach (KeyValuePair<AST.Address,string> entry in inputs)
            {
                AST.Address cell = entry.Key;
                string cleanValue = entry.Value;

                // worst case seen so far for this cell
                double worstError = 0.0;
                string worstTypo = "";

                // k candidate error strings for this value
                string[] typos = generator.GenerateErrorStrings(cleanValue, classification, k);

                for (int attempt = 0; attempt < k; attempt++)
                {
                    // inject the typo
                    CellDict patch = new CellDict();
                    patch.Add(cell, typos[attempt]);
                    Utility.InjectValues(app, wb, patch);

                    // capture what the formulas produce with the typo in place
                    CellDict brokenOutputs = Utility.SaveOutputs(terminal_formula_nodes, dag);

                    // put the original value back
                    patch.Clear();
                    patch.Add(cell, cleanValue);
                    Utility.InjectValues(app, wb, patch);

                    double error = Utility.CalculateTotalError(correct_outputs, brokenOutputs);

                    // keep the typo with the strictly largest error
                    if (error > worstError)
                    {
                        worstError = error;
                        worstTypo = typos[attempt];
                    }
                }

                worstByCell.Add(cell, Tuple.Create(worstTypo, worstError));
            }

            return worstByCell;
        }
Exemple #16
0
        /// <summary>
        /// Runs the full analysis: propagates node weights, resamples every terminal input
        /// range, and returns the scores computed by <c>Inference</c>.
        /// </summary>
        /// <param name="num_bootstraps">Number of bootstrap samples drawn per input range.</param>
        /// <param name="dag">Graph to analyze; its node weights are modified.</param>
        /// <param name="app">Excel application (not used directly in this method).</param>
        /// <param name="weighted">Forwarded to <c>Inference</c>.</param>
        /// <param name="all_outputs">Passed to <c>dag.terminalFormulaNodes</c> to select the outputs.</param>
        /// <param name="max_duration_in_ms">Not used directly in this method.</param>
        /// <param name="sw">Not used directly in this method.</param>
        /// <param name="significance">Forwarded to <c>Inference</c>.</param>
        /// <param name="pb">Progress bar; its maximum is set here and it is incremented once per resampled range.</param>
        /// <returns>The scores produced by <c>Inference</c>.</returns>
        public static TreeScore DataDebug(int num_bootstraps,
            DAG dag,
            Excel.Application app,
            bool weighted,
            bool all_outputs,
            long max_duration_in_ms,
            Stopwatch sw,
            double significance,
            ProgBar pb)
        {
            // this modifies the weights of each node
            PropagateWeights(dag);

            // terminal functions and terminal input ranges
            var outputFunctions = dag.terminalFormulaNodes(all_outputs);
            var inputRanges = dag.terminalInputVectors();

            // first index: input range; second index: bootstrap number
            var samples = new InputSample[inputRanges.Length][];

            // random source shared by all resampling calls
            var random = new Random();

            // snapshot of the current inputs and outputs
            var savedInputs = StoreInputs(inputRanges, dag);
            var savedOutputs = StoreOutputs(outputFunctions, dag);

            // two ticks per input range: one for resampling below, one inside Inference
            pb.setMax(inputRanges.Length * 2);

            #region RESAMPLE

            var rangeIndex = 0;
            foreach (var range in inputRanges)
            {
                // draw the bootstrap samples for this range
                samples[rangeIndex] = Resample(num_bootstraps, savedInputs[range], random);
                rangeIndex++;

                pb.IncrementProgress();
            }

            #endregion RESAMPLE

            #region INFERENCE
            var result = Inference(
                num_bootstraps,
                samples,
                savedInputs,
                savedOutputs,
                inputRanges,
                outputFunctions,
                dag,
                weighted,
                significance,
                pb);
            return result;
            #endregion INFERENCE
        }
Exemple #17
0
        /// <summary>
        /// Captures the current values of every input range as an <c>InputSample</c> and
        /// writes each captured sample straight back into its range.
        /// </summary>
        /// <param name="inputs">Input ranges to capture.</param>
        /// <param name="dag">Graph used to resolve each range to its COM reference.</param>
        /// <returns>A dictionary from input range to its captured sample.</returns>
        private static Dictionary<AST.Range, InputSample> StoreInputs(AST.Range[] inputs, DAG dag)
        {
            var captured = new Dictionary<AST.Range, InputSample>();

            foreach (AST.Range range in inputs)
            {
                var comRef = dag.getCOMRefForRange(range);

                // sample sized like the range, filled from the whole COM array at once
                var sample = new InputSample(comRef.Height, comRef.Width);
                sample.AddArray(comRef.Range.Value2);

                captured.Add(range, sample);

                // Write the same values back. Per the original author's note this forces
                // Excel to recalculate its outputs exactly as it will for the bootstraps.
                BootMemo.ReplaceExcelRange(comRef.Range, sample);
            }

            return captured;
        }
Exemple #18
0
        /// <summary>
        /// Reads the current value of the output cells as strings, one array
        /// read of the worksheet's used range per worksheet.
        /// </summary>
        /// <param name="outputs">Addresses of the output cells.</param>
        /// <param name="dag">Graph used to resolve addresses to COM references.</param>
        /// <returns>
        /// A dictionary from address to the cell's value converted to a string;
        /// null, empty and whitespace-only values are stored as "".
        /// When a worksheet's used range is a single cell, the one entry added
        /// for that worksheet is keyed by an address built from the used
        /// range's row and column.
        /// </returns>
        public static Dictionary <AST.Address, string> StoreOutputs(AST.Address[] outputs, DAG dag)
        {
            var saved = new Dictionary<AST.Address, string>();

            // bucket the outputs by the name of the worksheet they live on
            var by_sheet = outputs.GroupBy(o => dag.getCOMRefForAddress(o).WorksheetName);

            foreach (IEnumerable<AST.Address> sheet_outputs in by_sheet)
            {
                // any member of the bucket will do to reach the worksheet
                var first_ref = dag.getCOMRefForAddress(sheet_outputs.First());
                var used = first_ref.Worksheet.UsedRange;

                // bounding box of the used range (1-based, inclusive)
                var left = used.Column;
                var top = used.Row;
                var right = left + used.Columns.Count - 1;
                var bottom = top + used.Rows.Count - 1;

                if (left == right && top == bottom)
                {
                    // a one-cell used range: Value2 is a scalar, not an array
                    var wsname = new FSharpOption<string>(first_ref.WorksheetName);
                    var wbname = new FSharpOption<string>(first_ref.WorkbookName);
                    var path = first_ref.Path;

                    AST.Address only_cell = AST.Address.NewFromR1C1(top, left, wsname, wbname, path);
                    String scalar_text = System.Convert.ToString(used.Value2);
                    saved.Add(only_cell, String.IsNullOrWhiteSpace(scalar_text) ? "" : scalar_text);
                    continue;
                }

                // fast array read; first index is the row (y), second the column (x)
                object[,] grid = used.Value2;

                // offsets that translate worksheet coordinates into grid indices
                var col_shift = left - 1;
                var row_shift = top - 1;

                foreach (AST.Address cell in sheet_outputs)
                {
                    var col = cell.X - col_shift;
                    var row = cell.Y - row_shift;

                    String text = System.Convert.ToString(grid[row, col]);
                    saved.Add(cell, String.IsNullOrWhiteSpace(text) ? "" : text);
                }
            }

            return saved;
        }
Exemple #19
0
        /// <summary>
        /// For every input range, replays each bootstrap resample through the
        /// workbook, records the resulting outputs, and hands the hypothesis
        /// test for that range to the thread pool. Once all jobs have
        /// signalled completion their results are merged into one score table.
        /// </summary>
        /// <param name="num_bootstraps">Number of resamples per input range.</param>
        /// <param name="resamples">Resamples indexed by [input range][bootstrap].</param>
        /// <param name="initial_inputs">Original sample for each input range; used to restore the range.</param>
        /// <param name="initial_outputs">Original output values; passed to each job.</param>
        /// <param name="input_arr">Input ranges to perturb.</param>
        /// <param name="output_arr">Output addresses to read after each perturbation.</param>
        /// <param name="dag">Graph used to resolve ranges and read cell values.</param>
        /// <param name="weighted">Passed through to each job.</param>
        /// <param name="significance">Passed through to each job.</param>
        /// <param name="pb">Progress bar, incremented once per input range.</param>
        /// <returns>The merged scores of all jobs.</returns>
        /// <exception cref="System.OutOfMemoryException">
        /// Rethrown when an allocation still fails after all running jobs have
        /// finished and a garbage collection has been forced.
        /// </exception>
        public static TreeScore Inference(
            int num_bootstraps,
            InputSample[][] resamples,
            Dictionary <AST.Range, InputSample> initial_inputs,
            Dictionary <AST.Address, string> initial_outputs,
            AST.Range[] input_arr,
            AST.Address[] output_arr,
            DAG dag,
            bool weighted,
            double significance,
            ProgBar pb)
        {
            // WaitHandle.WaitAny rejects arrays with more than 64 handles
            const int MAX_WAIT_HANDLES = 64;

            // init thread event notification array
            var mres = new ManualResetEvent[input_arr.Length];

            // init job storage
            var ddjs = new DataDebugJob[input_arr.Length];

            // init started jobs count
            var sjobs = 0;

            // init completed jobs count
            var cjobs = 0;

            // last-ditch effort flag
            bool last_try = false;

            // init score storage
            var scores = new TreeScore();

            // i only advances once the job for input i has been queued, so an
            // out-of-memory failure retries the same input instead of skipping it
            int i = 0;
            while (i < input_arr.Length)
            {
                try
                {
                    #region BOOTSTRAP
                    // bootstrapping is done in the parent STA thread because
                    // the .NET threading model prohibits thread pools (which
                    // are MTA) from accessing STA COM objects directly.

                    // alloc bootstrap storage for each output (f), for each resample (b)
                    // NOTE(review): sized by initial_outputs.Count but filled by
                    // output_arr.Length below; presumably the two are equal - confirm.
                    FunctionOutput <string>[][] bs = new FunctionOutput <string> [initial_outputs.Count][];
                    for (int f = 0; f < initial_outputs.Count; f++)
                    {
                        bs[f] = new FunctionOutput <string> [num_bootstraps];
                    }

                    // init memoization table for input vector i
                    var memo = new BootMemo();

                    // fetch the input range
                    var input = input_arr[i];

                    // fetch the input range COM object
                    var com = dag.getCOMRefForRange(input).Range;

                    // compute outputs: replace the values of the COM object with
                    // the bth bootstrap and save all function outputs
                    for (var b = 0; b < num_bootstraps; b++)
                    {
                        // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                        FunctionOutput <string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                        for (var f = 0; f < output_arr.Length; f++)
                        {
                            bs[f][b] = fos[f];
                        }
                    }

                    // restore the original inputs; faster to do once, after bootstrapping is done
                    BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

                    // TODO: restore formulas if it turns out that they were overwritten
                    //       this should never be the case
                    #endregion BOOTSTRAP

                    #region HYPOTHESIS_TEST
                    // completion signal for this job
                    mres[i] = new ManualResetEvent(false);

                    // set up job
                    ddjs[i] = new DataDebugJob(
                        dag,
                        bs,
                        initial_outputs,
                        input_arr[i],
                        output_arr,
                        weighted,
                        significance,
                        mres[i]
                        );

                    // hand job to thread pool
                    ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);

                    // only count the job and move on once it is really queued
                    sjobs++;
                    i++;
                    #endregion HYPOTHESIS_TEST

                    // update progress bar
                    pb.IncrementProgress();
                }
                catch (System.OutOfMemoryException)
                {
                    if (last_try)
                    {
                        // we just don't have enough memory;
                        // bare throw keeps the original stack trace
                        throw;
                    }

                    // If there are no more jobs running, but
                    // we still can't allocate memory, try invoking
                    // GC and then trying again.
                    // Slots for inputs not yet processed are still null.
                    cjobs = mres.Count(mre => mre != null && mre.WaitOne(0));
                    if (sjobs - cjobs == 0)
                    {
                        GC.Collect();
                        last_try = true;
                    }

                    // wait for one of the still-running work items among
                    // 0..i-1 to complete, then retry input i; already-signalled
                    // events are excluded so the wait actually blocks
                    var pending = mres.Take(i)
                                      .Where(mre => mre != null && !mre.WaitOne(0))
                                      .Take(MAX_WAIT_HANDLES)
                                      .ToArray();
                    if (pending.Length > 0)
                    {
                        WaitHandle.WaitAny(pending);
                    }
                }
            }

            // Do not proceed until all hypothesis tests are done.
            // WaitHandle.WaitAll cannot be called on an STA thread which
            // is why we call WaitOne in a loop.
            // Merge scores as data becomes available.
            for (int j = 0; j < input_arr.Length; j++)
            {
                mres[j].WaitOne();
                scores = DictAdd(scores, ddjs[j].Result);
            }

            return(scores);
        }
Exemple #20
0
        /// <summary>
        /// Takes a sample of each input range's current values and immediately
        /// writes that sample back into the range.
        /// </summary>
        /// <param name="inputs">The input ranges to sample.</param>
        /// <param name="dag">Graph used to look up the COM reference of each range.</param>
        /// <returns>The samples, keyed by the range they were read from.</returns>
        private static Dictionary <AST.Range, InputSample> StoreInputs(AST.Range[] inputs, DAG dag)
        {
            var originals = new Dictionary<AST.Range, InputSample>();

            for (int n = 0; n < inputs.Length; n++)
            {
                AST.Range rng = inputs[n];
                var cref = dag.getCOMRefForRange(rng);

                // the whole range is read with a single Value2 access
                var sample = new InputSample(cref.Height, cref.Width);
                sample.AddArray(cref.Range.Value2);

                originals.Add(rng, sample);

                // writing the values back forces Excel to recalculate its
                // outputs exactly the way it will for the bootstraps
                BootMemo.ReplaceExcelRange(cref.Range, sample);
            }

            return(originals);
        }
Exemple #21
0
        /// <summary>
        /// Entry point of the analysis: propagates node weights, snapshots the
        /// terminal inputs and outputs, draws the bootstrap resamples for every
        /// terminal input vector, and then runs the inference step.
        /// </summary>
        /// <param name="num_bootstraps">Number of bootstrap samples to draw per input range.</param>
        /// <param name="dag">Dependency graph; its node weights are modified.</param>
        /// <param name="app">Not used by this method body.</param>
        /// <param name="weighted">Passed through to the inference step.</param>
        /// <param name="all_outputs">Passed to <c>dag.terminalFormulaNodes</c> when selecting outputs.</param>
        /// <param name="max_duration_in_ms">Not used by this method body.</param>
        /// <param name="sw">Not used by this method body.</param>
        /// <param name="significance">Passed through to the inference step.</param>
        /// <param name="pb">Progress bar; its maximum is set to twice the number of input ranges.</param>
        /// <returns>The scores produced by the inference step.</returns>
        public static TreeScore DataDebug(int num_bootstraps,
                                          DAG dag,
                                          Excel.Application app,
                                          bool weighted,
                                          bool all_outputs,
                                          long max_duration_in_ms,
                                          Stopwatch sw,
                                          double significance,
                                          ProgBar pb)
        {
            // weights must be in place before anything else happens
            PropagateWeights(dag);

            // the outputs and inputs the analysis will look at
            var output_fns = dag.terminalFormulaNodes(all_outputs);
            var input_rngs = dag.terminalInputVectors();

            // resamples[i][b] is the bth bootstrap of input range i
            var resamples = new InputSample[input_rngs.Length][];

            // source of randomness for resampling
            var rng = new Random();

            // remember what the workbook looked like before we touch it
            var initial_inputs  = StoreInputs(input_rngs, dag);
            var initial_outputs = StoreOutputs(output_fns, dag);

            // one tick per range for resampling, one more per range for inference
            pb.setMax(input_rngs.Length * 2);

            // draw the bootstraps, one input range at a time
            int range_idx = 0;
            while (range_idx < input_rngs.Length)
            {
                var original_sample = initial_inputs[input_rngs[range_idx]];
                resamples[range_idx] = Resample(num_bootstraps, original_sample, rng);
                pb.IncrementProgress();
                range_idx++;
            }

            // score the inputs
            TreeScore result = Inference(
                num_bootstraps,
                resamples,
                initial_inputs,
                initial_outputs,
                input_rngs,
                output_fns,
                dag,
                weighted,
                significance,
                pb);
            return result;
        }
        /// <summary>
        /// Builds the dependency graph for the active workbook, runs the
        /// bootstrap analysis, and stores the cells worth flagging in
        /// <c>_flaggable</c>. Screen updating is switched off for the duration
        /// and is always switched back on, including when an exception escapes.
        /// </summary>
        /// <param name="max_duration_in_ms">Time budget passed through to the analysis.</param>
        /// <exception cref="ExcelParserUtility.ParseException">
        /// Propagated unchanged (with its original stack trace) when building
        /// the dependency graph fails.
        /// </exception>
        public void Analyze(long max_duration_in_ms)
        {
            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            using (var pb = new ProgBar())
            {
                // Disable screen updating during analysis to speed things up
                _app.ScreenUpdating = false;

                try
                {
                    // Build dependency graph (modifies data)
                    _dag = new DAG(_app.ActiveWorkbook, _app, IGNORE_PARSE_ERRORS);

                    // NOTE(review): the result was never used; the call is kept
                    // in case it has side effects - confirm and remove if not.
                    _dag.numberOfInputCells();

                    if (_dag.terminalInputVectors().Length == 0)
                    {
                        System.Windows.Forms.MessageBox.Show("This spreadsheet contains no vector-input functions.");
                        _flaggable = new KeyValuePair<AST.Address, int>[0];
                        return;
                    }

                    // Get bootstraps
                    var scores = Analysis.DataDebug(NBOOTS,
                                                    _dag,
                                                    _app,
                                                    weighted: USE_WEIGHTS,
                                                    all_outputs: CONSIDER_ALL_OUTPUTS,
                                                    max_duration_in_ms: max_duration_in_ms,
                                                    sw: sw,
                                                    significance: _tool_significance,
                                                    pb: pb)
                                         .OrderByDescending(pair => pair.Value).ToArray();

                    if (_debug_mode)
                    {
                        var score_str = String.Join("\n", scores.Take(10).Select(score => score.Key.A1FullyQualified() + " -> " + score.Value.ToString()));
                        System.Windows.Forms.MessageBox.Show(score_str);
                        System.Windows.Forms.Clipboard.SetText(score_str);
                    }

                    // calculate cutoff index; Convert.ToInt32 can round the
                    // product down to 0 for small score sets, which would put
                    // the index one past the end, so clamp it into range
                    int thresh = scores.Length - Convert.ToInt32(scores.Length * _tool_significance);
                    thresh = Math.Max(0, Math.Min(thresh, scores.Length - 1));
                    var cutoff = scores.Length > 0 ? scores[thresh].Value : 0;

                    // filter out cells that are...
                    _flaggable = scores.Where(pair => pair.Value >= cutoff)                 // below threshold
                                       .Where(pair => !_known_good.Contains(pair.Key))      // known to be good
                                       .Where(pair => pair.Value != 0).ToArray();           // score == 0
                }
                finally
                {
                    // Enable screen updating when we're done, however we leave
                    _app.ScreenUpdating = true;

                    sw.Stop();
                }
            }
        }
Exemple #23
0
        /// <summary>
        /// Computes and stores the weight of a node. A non-formula node weighs
        /// 1; a formula node weighs the sum of the weights of all cells in its
        /// input vectors plus its single-cell inputs, computed recursively.
        /// </summary>
        /// <param name="node">Address of the node to weigh.</param>
        /// <param name="dag">Graph that is queried for inputs and receives the weights.</param>
        /// <returns>The weight that was stored for <paramref name="node"/>.</returns>
        private static int PropagateNodeWeight(AST.Address node, DAG dag)
        {
            // base case: an input contributes exactly one unit
            if (!dag.isFormula(node))
            {
                dag.setWeight(node, 1);
                return 1;
            }

            // gather every cell feeding this formula: vector cells first,
            // then single-cell inputs
            var feeders = dag.getFormulaInputVectors(node)
                             .SelectMany(vec => vec.Addresses())
                             .ToList();
            feeders.AddRange(dag.getFormulaSingleCellInputs(node));

            // recurse into each feeder and accumulate
            int total = 0;
            for (int idx = 0; idx < feeders.Count; idx++)
            {
                total = total + PropagateNodeWeight(feeders[idx], dag);
            }

            dag.setWeight(node, total);
            return total;
        }
Exemple #24
0
        /// <summary>
        /// Assigns a weight to every node reachable from the formulas returned
        /// by <c>dag.terminalFormulaNodes(false)</c>.
        /// </summary>
        /// <param name="dag">Graph whose node weights are set.</param>
        /// <exception cref="ContainsLoopException">Thrown when the graph reports a loop.</exception>
        private static void PropagateWeights(DAG dag)
        {
            // the recursive weighting below cannot run on a graph with a loop
            bool has_cycle = dag.containsLoop();
            if (has_cycle)
            {
                throw new ContainsLoopException();
            }

            // each of these formulas is the root of one tree in the forest
            var roots = dag.terminalFormulaNodes(false);

            foreach (AST.Address root in roots)
            {
                int root_weight = PropagateNodeWeight(root, dag);
                dag.setWeight(root, root_weight);
            }
        }
Exemple #25
0
 /// <summary>
 /// Saves the value of every cell returned by
 /// <c>dag.allComputationCells()</c>, i.e. the spreadsheet values that
 /// participate in any computation.
 /// </summary>
 /// <param name="dag">Graph that lists the cells and reads their values.</param>
 /// <returns>A dictionary from cell address to the value read for it.</returns>
 /// <exception cref="Exception">
 /// Thrown when anything fails while reading; the original failure is
 /// available as <c>InnerException</c>.
 /// </exception>
 public static CellDict SaveInputs(DAG dag)
 {
     try
     {
         var cd = new CellDict();
         foreach (var addr in dag.allComputationCells())
         {
             cd.Add(addr, dag.readCOMValueAtAddress(addr));
         }
         return cd;
     }
     catch (Exception e)
     {
         // pass the original exception along as InnerException so its type
         // and stack trace are not lost
         throw new Exception(String.Format("Failed in SaveInputs: {0}", e.Message), e);
     }
 }
Exemple #26
0
 /// <summary>
 /// Creates a job by storing every argument in the corresponding field and
 /// starting with an empty score table.
 /// </summary>
 /// <param name="dag">Stored in <c>_dag</c>.</param>
 /// <param name="bs">Stored in <c>_bs</c>.</param>
 /// <param name="initial_outputs">Stored in <c>_initial_outputs</c>.</param>
 /// <param name="input">Stored in <c>_input</c>.</param>
 /// <param name="output_arr">Stored in <c>_outputs</c>.</param>
 /// <param name="weighted">Stored in <c>_weighted</c>.</param>
 /// <param name="significance">Stored in <c>_significance</c>.</param>
 /// <param name="mre">Stored in <c>_mre</c>.</param>
 public DataDebugJob(
     DAG dag,
     FunctionOutput<String>[][] bs,
     Dictionary<AST.Address, string> initial_outputs,
     AST.Range input,
     AST.Address[] output_arr,
     bool weighted,
     double significance,
     ManualResetEvent mre)
 {
     // fresh, empty result table
     this._score = new TreeScore();

     // what to analyze
     this._input = input;
     this._outputs = output_arr;
     this._initial_outputs = initial_outputs;
     this._bs = bs;
     this._dag = dag;

     // how to analyze it
     this._significance = significance;
     this._weighted = weighted;

     // event handed in by the creator
     this._mre = mre;
 }
Exemple #27
0
        /// <summary>
        /// For every input cell, tries up to <paramref name="k"/> generated
        /// error strings, measures the total output error each one causes, and
        /// remembers the string with the largest error. Returns the inputs
        /// with the largest such errors, mapped to that string.
        /// </summary>
        /// <param name="output_nodes">Output cells whose values are compared.</param>
        /// <param name="inputs">Input cells and their original values.</param>
        /// <param name="k">Number of alternatives to consider per input.</param>
        /// <param name="correct_outputs">Output values to compare against.</param>
        /// <param name="app">Passed to <c>InjectValues</c>.</param>
        /// <param name="wb">Passed to <c>InjectValues</c>.</param>
        /// <param name="c">Classification from which error strings are generated.</param>
        /// <param name="dag">Graph used when saving outputs.</param>
        /// <returns>
        /// The top <c>ceil(0.05 * inputs.Count)</c> inputs by maximum error,
        /// each mapped to the error string that produced it.
        /// </returns>
        public static CellDict GenImportantErrors(AST.Address[] output_nodes,
            CellDict inputs,
            int k,         // number of alternatives to consider
            CellDict correct_outputs,
            Excel.Application app,
            Excel.Workbook wb,
            Classification c,
            DAG dag)
        {
            var eg = new ErrorGenerator();
            var max_error_produced_dictionary = new Dictionary<AST.Address, Tuple<string, double>>();

            foreach (KeyValuePair<AST.Address, string> pair in inputs)
            {
                AST.Address addr = pair.Key;
                string orig_value = pair.Value;

                // best (largest) error seen so far for this cell
                double max_error_produced = 0.0;
                string max_error_string = "";

                // get k strings
                string[] errorstrings = eg.GenerateErrorStrings(orig_value, c, k);

                // never index past what the generator actually returned
                int alternatives = Math.Min(k, errorstrings.Length);

                for (int i = 0; i < alternatives; i++)
                {
                    CellDict incorrect_outputs;
                    try
                    {
                        // inject the typo
                        var typo = new CellDict();
                        typo.Add(addr, errorstrings[i]);
                        InjectValues(app, wb, typo);

                        // save function outputs
                        incorrect_outputs = SaveOutputs(output_nodes, dag);
                    }
                    finally
                    {
                        // remove the typo that was introduced, even when
                        // reading the outputs failed, so the workbook is not
                        // left holding the injected value
                        var restore = new CellDict();
                        restore.Add(addr, orig_value);
                        InjectValues(app, wb, restore);
                    }

                    double total_error = Utility.CalculateTotalError(correct_outputs, incorrect_outputs);

                    // keep track of the largest observed max error
                    if (total_error > max_error_produced)
                    {
                        max_error_produced = total_error;
                        max_error_string = errorstrings[i];
                    }
                }

                // add entry for this cell in our dictionary with its max_error_produced
                max_error_produced_dictionary.Add(addr, new Tuple<string, double>(max_error_string, max_error_produced));
            }

            // sort by max_error_produced
            var maxen = max_error_produced_dictionary.OrderByDescending(pair => pair.Value.Item2).Select(pair => new Tuple<AST.Address, string>(pair.Key, pair.Value.Item1)).ToList();

            return maxen.Take((int)Math.Ceiling(0.05 * inputs.Count)).ToDictionary(tup => tup.Item1, tup => tup.Item2);
        }
Exemple #28
0
        /// <summary>
        /// Runs one simulation: injects the stored errors into the workbook,
        /// lets the simulated user remove them, computes the error, effort and
        /// precision statistics, and finally restores the original inputs.
        /// </summary>
        /// <param name="nboots">Number of bootstraps.</param>
        /// <param name="xlfile">Name of the workbook.</param>
        /// <param name="significance">Significance threshold for the test.</param>
        /// <param name="ck">Kind of threshold function to use.</param>
        /// <param name="app">Reference to the Excel application.</param>
        /// <param name="c">Data from which to generate errors (not used by this method body).</param>
        /// <param name="r">A random number generator (not used by this method body).</param>
        /// <param name="analysisType">The type of analysis to run.</param>
        /// <param name="weighted">Whether to weigh things.</param>
        /// <param name="all_outputs">If false, only terminal outputs are considered.</param>
        /// <param name="dag">Dependency graph of the workbook.</param>
        /// <param name="wb">The workbook; it is modified during the run.</param>
        /// <param name="terminal_formula_cells">Output cells whose values are saved.</param>
        /// <param name="terminal_input_vectors">Not used by this method body.</param>
        /// <param name="original_inputs">Original input values, re-injected at the end.</param>
        /// <param name="correct_outputs">Outputs to compare against.</param>
        /// <param name="max_duration_in_ms">Time budget handed to the simulated user.</param>
        /// <param name="logfile">Filename for the output log.</param>
        /// <param name="pb">Progress bar handed to the simulated user.</param>
        /// <returns>The number of cells inspected.</returns>
        public int Run(int nboots,
            string xlfile,
            double significance,
            CutoffKind ck,
            Excel.Application app,
            Classification c,
            Random r,
            AnalysisType analysisType,
            bool weighted,
            bool all_outputs,
            DAG dag,
            Excel.Workbook wb,
            AST.Address[] terminal_formula_cells,
            AST.Range[] terminal_input_vectors,
            CellDict original_inputs,
            CellDict correct_outputs,
            long max_duration_in_ms,
            String logfile,
            ProgBar pb
            )
        {
            // remember the configuration of this run
            _analysis_type = analysisType;
            _significance = significance;
            _weighted = weighted;
            _all_outputs = all_outputs;
            _wb_name = xlfile;
            _wb_path = wb.Path;

            // put the errors from _errors into the workbook and record
            // what the outputs look like with them in place
            Utility.InjectValues(app, wb, _errors);
            CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // time the removal of errors; MODIFIES WORKBOOK
            Stopwatch sw = Stopwatch.StartNew();
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, sw, logfile, pb);
            sw.Stop();
            _analysis_time = sw.Elapsed.TotalSeconds;

            // outputs after the simulated user has made its corrections
            var partially_corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // total relative error remaining after correction
            _error = Utility.CalculateNormalizedError(correct_outputs, partially_corrected_outputs, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // total relative error we started with (normalized by max_errors)
            ErrorDict starting_error = Utility.CalculateNormalizedError(correct_outputs, incorrect_outputs, _user.max_errors);
            _initial_total_relative_error = Utility.TotalRelativeError(starting_error);

            // effort: inspected cells relative to all cells
            _max_effort = dag.allCells().Length;
            _effort = _user.true_positives.Count + _user.false_positives.Count;
            _expended_effort = (double)_effort / (double)_max_effort;

            // average precision:
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k and
            // rel(k) is 1 if the item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // put the original values back
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

            // results are now safe to print
            _simulation_run = true;

            return _effort;
        }
        /// <summary>
        /// Walks the vectors of the graph in order, builds a normal
        /// distribution for each, and returns the first reported error cell
        /// that is not already known to be good.
        /// </summary>
        /// <param name="dag">Graph that supplies the vectors and their COM ranges.</param>
        /// <param name="wb">Workbook used to turn a COM cell into an address.</param>
        /// <param name="known_good">Cells that must not be flagged again.</param>
        /// <param name="max_duration_in_ms">Time budget in milliseconds.</param>
        /// <param name="sw">Running stopwatch the budget is checked against.</param>
        /// <returns>The flagged cell, or null when no vector yields one.</returns>
        /// <exception cref="TimeoutException">
        /// Thrown when the stopwatch exceeds the budget while errors are being examined.
        /// </exception>
        public static AST.Address NormalPerRange_Step(DAG dag,
            Excel.Workbook wb,
            HashSet<AST.Address> known_good,
            long max_duration_in_ms,
            Stopwatch sw)
        {
            foreach (var range_addr in dag.allVectors())
            {
                var dist = new DataDebugMethods.NormalDistribution(dag.getCOMRefForRange(range_addr).Range);

                // first error of this range that has not been inspected already
                AST.Address hit = null;

                for (int pos = 0; pos < dist.getErrorsCount(); pos++)
                {
                    // give up once the time budget is spent
                    if (sw.ElapsedMilliseconds > max_duration_in_ms)
                    {
                        throw new TimeoutException("Timeout exception in NormalPerRange_Step.");
                    }

                    var com_cell = dist.getErrorAtPosition(pos);
                    AST.Address candidate = AST.Address.AddressFromCOMObject(com_cell, wb);

                    if (!known_good.Contains(candidate))
                    {
                        hit = candidate;
                        break;
                    }
                }

                // stop at the first range that produced a flagged cell
                if (hit != null)
                {
                    return hit;
                }
            }

            return null;
        }
Exemple #30
0
        /// <summary>
        /// Reads the current value of each formula cell into a CellDict.
        /// </summary>
        /// <param name="formula_nodes">Addresses of the formula cells to read.</param>
        /// <param name="dag">Graph used to read the cell values.</param>
        /// <returns>A dictionary from formula address to the value read for it.</returns>
        /// <exception cref="Exception">
        /// Thrown when an address occurs more than once, or (DEBUG builds only)
        /// when a cell does not contain a formula.
        /// </exception>
        public static CellDict SaveOutputs(AST.Address[] formula_nodes, DAG dag)
        {
            var saved = new CellDict();

            for (int idx = 0; idx < formula_nodes.Length; idx++)
            {
                AST.Address cell = formula_nodes[idx];

                // throw an exception in debug mode, because this should never happen
                #if DEBUG
                bool has_formula = (bool)(dag.getCOMRefForAddress(cell).Range.HasFormula);
                if (!has_formula)
                {
                    String fstring = dag.getFormulaAtAddress(cell);
                    throw new Exception("Formula address is not a formula.");
                }
                #endif

                // the same address must not be saved twice
                if (saved.ContainsKey(cell))
                {
                    throw new Exception(String.Format("Failed in SaveOutputs."));
                }

                // save value
                saved.Add(cell, dag.readCOMValueAtAddress(cell));
            }

            return saved;
        }
Exemple #31
0
        /// <summary>
        /// Batch-runner entry point: validates the inputs, records the given
        /// errors, computes the largest numeric and string magnitudes among
        /// the errors and the correct outputs, and then delegates to
        /// <c>Run</c> with a null progress bar.
        /// </summary>
        /// <param name="nboots">Number of bootstraps.</param>
        /// <param name="xlfile">Name of the workbook.</param>
        /// <param name="significance">Significance threshold for the test.</param>
        /// <param name="app">Reference to the Excel application.</param>
        /// <param name="ck">Kind of threshold function to use.</param>
        /// <param name="c">Data from which to generate errors.</param>
        /// <param name="r">A random number generator.</param>
        /// <param name="analysisType">The type of analysis to run.</param>
        /// <param name="weighted">Whether to weigh things.</param>
        /// <param name="all_outputs">If false, only terminal outputs are considered.</param>
        /// <param name="dag">The computation tree of the spreadsheet.</param>
        /// <param name="wb">The workbook being analyzed.</param>
        /// <param name="errors">The errors that will be introduced in the spreadsheet.</param>
        /// <param name="terminal_input_vectors">The inputs.</param>
        /// <param name="terminal_formula_cells">The outputs.</param>
        /// <param name="original_inputs">Original values of the inputs.</param>
        /// <param name="correct_outputs">The correct outputs.</param>
        /// <param name="max_duration_in_ms">Time budget passed on to <c>Run</c>.</param>
        /// <param name="logfile">Filename for the output log.</param>
        /// <returns>The number of cells inspected, as returned by <c>Run</c>.</returns>
        /// <exception cref="NoRangeInputs">Thrown when there are no terminal input vectors.</exception>
        /// <exception cref="NoFormulas">Thrown when <paramref name="original_inputs"/> is empty.</exception>
        public int RunFromBatch(int nboots,
            string xlfile,
            double significance,
            Excel.Application app,
            CutoffKind ck,
            Classification c,
            Random r,
            AnalysisType analysisType,
            bool weighted,
            bool all_outputs,
            DAG dag,
            Excel.Workbook wb,
            CellDict errors,
            AST.Range[] terminal_input_vectors,
            AST.Address[] terminal_formula_cells,
            CellDict original_inputs,
            CellDict correct_outputs,
            long max_duration_in_ms,
            String logfile
            )
        {
            // refuse to run without inputs
            if (terminal_input_vectors.Length == 0)
            {
                throw new NoRangeInputs();
            }
            if (!original_inputs.Any())
            {
                throw new NoFormulas();
            }

            _errors = errors;

            // split the errors into those where both the error and the original
            // are numbers and the rest; the largest magnitude is mostly useful
            // for the single-perturbation experiments
            var numeric_errors = _errors.Where(err => Utility.BothNumbers(err.Value, original_inputs[err.Key]));
            var string_errors = _errors.Where(err => !Utility.BothNumbers(err.Value, original_inputs[err.Key]));

            _num_max_err_diff_mag = numeric_errors.Any()
                ? numeric_errors
                      .Select(err => Utility.NumericalMagnitudeChange(Double.Parse(err.Value), Double.Parse(original_inputs[err.Key])))
                      .Max()
                : 0;
            _str_max_err_diff_mag = string_errors.Any()
                ? string_errors
                      .Select(err => Utility.StringMagnitudeChange(err.Value, original_inputs[err.Key]))
                      .Max()
                : 0;

            // same split for the correct outputs
            var numeric_outputs = correct_outputs.Where(outp => Utility.IsNumber(outp.Value));
            var string_outputs = correct_outputs.Where(outp => !Utility.IsNumber(outp.Value));

            _num_max_output_diff_mag = numeric_outputs.Any()
                ? numeric_outputs
                      .Select(outp => Utility.NumericalMagnitudeChange(Double.Parse(outp.Value), Double.Parse(correct_outputs[outp.Key])))
                      .Max()
                : 0;
            _str_max_output_diff_mag = string_outputs.Any()
                ? string_outputs
                      .Select(outp => Utility.StringMagnitudeChange(outp.Value, correct_outputs[outp.Key]))
                      .Max()
                : 0;

            // the batch runner has no progress bar
            return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
        }
        /// <summary>
        /// Scatters a fixed set of formulas randomly over the worksheets of a
        /// mock workbook (at least one per worksheet), builds a DAG, and checks
        /// that the DAG reports the expected number of formula addresses and
        /// that every placed formula is present among the formula cells of its
        /// worksheet's used range.
        /// </summary>
        public void TestGetFormulaRanges()
        {
            var mwb = new MockWorkbook();

            // decides which formulas land on which worksheet
            Random rand = new Random();

            // (cell, formula) pairs to choose from
            Tuple<string, string>[] fs =
            {
                Tuple.Create("B4", "=COUNT(A1:A5)"),
                Tuple.Create("A6", "=SUM(B5:B40)"),
                Tuple.Create("Z2", "=AVERAGE(A1:E1)"),
                Tuple.Create("B44", "=MEDIAN(D4:D9)")
            };

            // what was placed on each worksheet
            var d = new System.Collections.Generic.Dictionary<Excel.Worksheet, System.Collections.Generic.List<Tuple<string, string>>>();

            foreach (Excel.Worksheet w in mwb.GetWorksheets())
            {
                var placed = new System.Collections.Generic.List<Tuple<string, string>>();
                d[w] = placed;

                // coin flip per formula
                for (int idx = 0; idx < fs.Length; idx++)
                {
                    Tuple<string, string> entry = fs[idx];
                    if (rand.Next(0, 2) != 0)
                    {
                        continue;
                    }
                    w.Range[entry.Item1, entry.Item1].Formula = entry.Item2;
                    placed.Add(entry);
                }

                // every worksheet needs at least one formula
                if (placed.Count == 0)
                {
                    w.Range[fs[0].Item1, fs[0].Item1].Formula = fs[0].Item2;
                    placed.Add(fs[0]);
                }
            }

            // init DAG
            var dag = new DAG(mwb.GetWorkbook(), mwb.GetApplication(), false);

            // every formula address the DAG knows about
            var formulas = dag.getAllFormulaAddrs();

            // the DAG must report exactly as many formulas as were placed
            var expected = d.Values.Select(v => v.Count).Aggregate((acc, c) => acc + c);
            if (formulas.Length != expected)
            {
                throw new Exception("DAG.getAllFormulaAddrs() should return " + expected + " elements but instead returns " + formulas.Length + ".");
            }

            // group the reported formulas by the worksheet they live on
            var f_wsgroups = formulas.GroupBy(addr => addr.GetCOMObject(mwb.GetApplication()).Worksheet);

            bool all_ok = true;

            foreach (var ws_group in f_wsgroups)
            {
                // formula cells of this worksheet
                var formula_cells = ws_group.Key.UsedRange.SpecialCells(Excel.XlCellType.xlCellTypeFormulas);

                // every formula placed on this worksheet must show up among them
                bool sheet_ok = true;
                foreach (Tuple<string, string> wanted in d[formula_cells.Worksheet])
                {
                    bool found = false;
                    foreach (Excel.Range cell in formula_cells)
                    {
                        if (String.Equals((string)cell.Formula, wanted.Item2))
                        {
                            found = true;
                        }
                    }
                    sheet_ok = sheet_ok && found;
                }

                all_ok = all_ok && sheet_ok;
            }

            if (!all_ok)
            {
                throw new Exception("ConstructTree.GetFormulaRanges() failed to return all of the formulae that were added.");
            }
        }
Exemple #33
0
        /// <summary>
        /// Simulates a user who repeatedly inspects the cell flagged by the selected
        /// analysis and, when that cell is a key of <paramref name="errord"/>, restores
        /// its value from <paramref name="original_inputs"/>. The loop ends when the
        /// analysis flags nothing, or when a count-based cutoff has been reached.
        /// Every inspected cell and every false negative is appended to _error_log.
        /// </summary>
        /// <param name="nboots">Forwarded to SimulationStep.CheckCell_Step.</param>
        /// <param name="significance">Forwarded to SimulationStep.CheckCell_Step and recorded in every log entry.</param>
        /// <param name="ck">Cutoff; when ck.isCountBased is set, the loop stops once cells_inspected equals ck.Threshold.</param>
        /// <param name="dag">Graph used to read the terminal formula outputs and passed to the analysis steps.</param>
        /// <param name="original_inputs">Values written back into flagged cells that are true errors; also logged as the original value.</param>
        /// <param name="errord">Cells treated as true errors, keyed by address, with the erroneous value used for magnitude computation and logging.</param>
        /// <param name="correct_outputs">Reference outputs that the saved outputs are compared against.</param>
        /// <param name="wb">Workbook; its Name is logged and it is passed to the NormalPerRange/NormalAllOutputs steps.</param>
        /// <param name="app">Excel application used to resolve the COM object of a flagged cell.</param>
        /// <param name="analysis_type">Selects the analysis step. Values other than CheckCell5, CheckCell10, NormalPerRange and NormalAllInputs flag nothing, so the loop ends immediately.</param>
        /// <param name="weighted">Forwarded to SimulationStep.CheckCell_Step.</param>
        /// <param name="all_outputs">Passed to dag.terminalFormulaNodes and to SimulationStep.CheckCell_Step.</param>
        /// <param name="max_duration_in_ms">Forwarded to the analysis steps.</param>
        /// <param name="sw">Forwarded to the analysis steps.</param>
        /// <param name="logfile">Passed to LogEntry.WriteLog for each inspected cell.</param>
        /// <param name="pb">Forwarded to SimulationStep.CheckCell_Step.</param>
        /// <returns>
        /// A UserResults with PrecRel_at_k, true_positives, false_positives,
        /// current_total_error, false_negatives and max_errors filled in.
        /// </returns>
        private UserResults SimulateUser(int nboots,
            double significance,
            CutoffKind ck,
            DAG dag,
            CellDict original_inputs,
            CellDict errord,
            CellDict correct_outputs,
            Excel.Workbook wb,
            Excel.Application app,
            AnalysisType analysis_type,
            bool weighted,
            bool all_outputs,
            long max_duration_in_ms,
            Stopwatch sw,
            String logfile,
            ProgBar pb
            )
        {
            // init user results data structure
            var o = new UserResults();
            HashSet<AST.Address> known_good = new HashSet<AST.Address>();

            // record the per-function max error of the spreadsheet as it is on entry
            var max_errors = new ErrorDict();
            var incorrect_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            var errors_found = 0;
            Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

            // the corrected state of the spreadsheet
            // NOTE(review): this starts as a copy of correct_outputs rather than the
            // outputs saved above, so entries logged before the first true positive
            // compare correct_outputs with itself -- confirm this is intended.
            CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

            // remove errors loop
            var cells_inspected = 0;
            List<KeyValuePair<AST.Address, int>> filtered_high_scores = null;
            bool correction_made = true;
            while (true)
            {
                Console.Write(".");

                AST.Address flagged_cell = null;

                // choose the appropriate test; an unrecognized analysis type flags nothing
                if (analysis_type == AnalysisType.CheckCell5 ||
                    analysis_type == AnalysisType.CheckCell10)
                {
                    flagged_cell = SimulationStep.CheckCell_Step(o,
                                                  significance,
                                                  ck,
                                                  nboots,
                                                  dag,
                                                  app,
                                                  weighted,
                                                  all_outputs,
                                                  correction_made,
                                                  known_good,
                                                  ref filtered_high_scores,
                                                  max_duration_in_ms,
                                                  sw,
                                                  pb);
                }
                else if (analysis_type == AnalysisType.NormalPerRange)
                {
                    flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
                }
                else if (analysis_type == AnalysisType.NormalAllInputs)
                {
                    flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
                }

                // stop if the test no longer returns anything or if
                // the test is simply done inspecting based on a fixed threshold
                if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
                {
                    break;
                }

                // cells_inspected is only incremented when a cell is actually flagged;
                // if nothing is flagged, nothing is inspected.
                cells_inspected += 1;

                // check once whether the flagged value is actually an error
                bool is_true_error = errord.ContainsKey(flagged_cell);

                if (is_true_error)
                {
                    correction_made = true;
                    errors_found += 1;
                    // P(k) * rel(k)
                    o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                    o.true_positives.Add(flagged_cell);

                    // correct flagged cell
                    flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                    // re-read the outputs FIRST so that the error figures below describe
                    // the spreadsheet after this correction, not the state before it
                    partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);

                    Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                    // compute total error after applying this correction
                    var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                    o.current_total_error.Add(current_total_error);
                }
                else
                {
                    correction_made = false;
                    // numerator is 0 here because rel(k) = 0 when no error was found
                    o.PrecRel_at_k.Add(0.0);
                    o.false_positives.Add(flagged_cell);
                }

                // mark it as known good -- at this point the cell has been
                //      'inspected' regardless of whether it was an error
                //      It was either corrected or marked as OK
                known_good.Add(flagged_cell);

                // compute output error magnitudes
                var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

                // compute input error magnitudes; both are zero for a false positive
                double num_input_error_magnitude = 0;
                double str_input_error_magnitude = 0;
                if (is_true_error)
                {
                    SimulateUserInputErrorMagnitudes(errord, original_inputs, flagged_cell, out num_input_error_magnitude, out str_input_error_magnitude);
                }

                // write error log
                var logentry = new LogEntry(analysis_type,
                                            wb.Name,
                                            flagged_cell,
                                            original_inputs[flagged_cell],
                                            is_true_error ? errord[flagged_cell] : original_inputs[flagged_cell],
                                            output_error_magnitude,
                                            num_input_error_magnitude,
                                            str_input_error_magnitude,
                                            true,
                                            correction_made,
                                            significance,
                                            ck.Threshold);
                logentry.WriteLog(logfile);
                _error_log.Add(logentry);
            }

            // find all of the false negatives
            o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
            o.max_errors = max_errors;

            var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // record all false negative information; these entries are appended to
            // _error_log only (WriteLog is not called for them here)
            foreach (AST.Address fn in o.false_negatives)
            {
                double num_input_error_magnitude;
                double str_input_error_magnitude;
                SimulateUserInputErrorMagnitudes(errord, original_inputs, fn, out num_input_error_magnitude, out str_input_error_magnitude);

                _error_log.Add(new LogEntry(analysis_type,
                                            wb.Name,
                                            fn,
                                            original_inputs[fn],
                                            errord[fn],
                                            last_out_err_mag,
                                            num_input_error_magnitude,
                                            str_input_error_magnitude,
                                            false,
                                            true,
                                            significance,
                                            ck.Threshold));
            }
            return o;
        }

        /// <summary>
        /// Computes the input error magnitudes for the cell at <paramref name="addr"/>,
        /// which must be a key of both dictionaries. When Utility.BothNumbers accepts the
        /// erroneous and original values, the numeric magnitude is computed and the string
        /// magnitude is zero; otherwise the string magnitude is computed and the numeric
        /// magnitude is zero.
        /// </summary>
        private static void SimulateUserInputErrorMagnitudes(CellDict errord,
            CellDict original_inputs,
            AST.Address addr,
            out double num_input_error_magnitude,
            out double str_input_error_magnitude)
        {
            var erroneous_value = errord[addr];
            var original_value = original_inputs[addr];

            if (Utility.BothNumbers(erroneous_value, original_value))
            {
                num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(erroneous_value), Double.Parse(original_value));
                str_input_error_magnitude = 0;
            }
            else
            {
                num_input_error_magnitude = 0;
                str_input_error_magnitude = Utility.StringMagnitudeChange(erroneous_value, original_value);
            }
        }