Example #1
0
        public static PrepData PrepSimulation(Excel.Application app, Excel.Workbook wbh, ProgBar pb, bool ignore_parse_errors)
        {
            // build graph
            var dag = new DAG(wbh, app, ignore_parse_errors);
            if (dag.containsLoop())
            {
                throw new DataDebugMethods.ContainsLoopException();
            }
            pb.IncrementProgress();

            // get terminal input and terminal formula nodes once
            var terminal_input_nodes = dag.terminalInputVectors();
            var terminal_formula_nodes = dag.terminalFormulaNodes(true);  ///the boolean indicates whether to use all outputs or not

            if (terminal_input_nodes.Length == 0)
            {
                throw new NoRangeInputs();
            }

            if (terminal_formula_nodes.Length == 0)
            {
                throw new NoFormulas();
            }

            // save original spreadsheet state
            CellDict original_inputs = UserSimulation.Utility.SaveInputs(dag);

            // force a recalculation before saving outputs, otherwise we may
            // erroneously conclude that the procedure did the wrong thing
            // based solely on Excel floating-point oddities
            UserSimulation.Utility.InjectValues(app, wbh, original_inputs);

            // save function outputs
            CellDict correct_outputs = UserSimulation.Utility.SaveOutputs(terminal_formula_nodes, dag);

            return new PrepData()
            {
                dag = dag,
                original_inputs = original_inputs,
                correct_outputs = correct_outputs,
                terminal_input_nodes = terminal_input_nodes,
                terminal_formula_nodes = terminal_formula_nodes
            };
        }
Example #2
0
        // this function returns an address but also updates
        // the filtered_high_scores list
        public static AST.Address CheckCell_Step(UserResults o,
            double significance,
            CutoffKind ck,
            int nboots,
            DAG dag,
            Excel.Application app,
            bool weighted,
            bool all_outputs,
            bool run_bootstrap,
            HashSet<AST.Address> known_good,
            ref List<KeyValuePair<AST.Address, int>> filtered_high_scores,
            long max_duration_in_ms,
            Stopwatch sw,
            ProgBar pb)
        {
            // Get bootstraps
            // The bootstrap should only re-run if there is a correction made,
            //      not when something is marked as OK (isn't one of the introduced errors)
            // The list of suspected cells doesn't change when we mark something as OK,
            //      we just move on to the next thing in the list
            if (run_bootstrap)
            {
                TreeScore scores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

                // apply a threshold to the scores
                filtered_high_scores = ck.applyCutoff(scores, known_good);
            }
            else  //if no corrections were made (a cell was marked as OK, not corrected)
            {
                //re-filter out cells marked as OK
                filtered_high_scores = filtered_high_scores.Where(kvp => !known_good.Contains(kvp.Key)).ToList();
            }

            if (filtered_high_scores.Count() != 0)
            {
                // get AST.Address corresponding to most unusual score
                return filtered_high_scores[0].Key;
            }
            else
            {
                return null;
            }
        }
Example #3
0
        public static void RunProportionExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // record intitial state of spreadsheet
            var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // init error generator
            var eg = new ErrorGenerator();

            // get inputs as an array of addresses to facilitate random selection
            // DATA INPUTS ONLY
            AST.Address[] inputs = prepdata.dag.terminalInputCells();

            // sanity check: all of the inputs should also be in prepdata.original_inputs
            foreach (AST.Address addr in inputs)
            {
                if (!prepdata.original_inputs.ContainsKey(addr))
                {
                    throw new Exception("Missing address!");
                }
            }

            for (int i = 0; i < 100; i++)
            {
                // randomly choose an input address
                AST.Address rand_addr = inputs[r.Next(inputs.Length)];

                // get the value
                String input_value = prepdata.original_inputs[rand_addr];

                // perturb it
                String erroneous_input = eg.GenerateErrorString(input_value, c);

                // create an error dictionary with this one perturbed value
                var errors = new CellDict();
                errors.Add(rand_addr, erroneous_input);

                // run simulations; simulation code does insertion of errors and restore of originals
                RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
            }
        }
Example #4
0
        public static void RunSimulation(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, PrepData prepdata, CellDict errors)
        {
            // write header if needed
            if (!System.IO.File.Exists(outfile))
            {
                System.IO.File.AppendAllText(outfile, Simulation.HeaderRowForCSV());
            }

            // CheckCell weighted, all outputs, quantile
            //var s_1 = new UserSimulation.Simulation();
            //s_1.RunFromBatch(nboots,                                   // number of bootstraps
            //                    wbh.FullName,                          // Excel filename
            //                    significance,                          // statistical significance threshold for hypothesis test
            //                    app,                                   // Excel.Application
            //                    new QuantileCutoff(0.05),              // max % extreme values to flag
            //                    c,                                     // classification data
            //                    r,                                     // random number generator
            //                    UserSimulation.AnalysisType.CheckCell5,// analysis type
            //                    true,                                  // weighted analysis
            //                    true,                                  // use all outputs for analysis
            //                    prepdata.graph,                                 // AnalysisData
            //                    wbh,                                   // Excel.Workbook
            //                    errors,                                // pre-generated errors
            //                    prepdata.terminal_input_nodes,                  // input range nodes
            //                    prepdata.terminal_formula_nodes,                // output nodes
            //                    prepdata.original_inputs,                       // original input values
            //                    prepdata.correct_outputs,                       // original output values
            //                    max_duration_in_ms,                    // max duration of simulation
            //                    logfile);
            //System.IO.File.AppendAllText(outfile, s_1.FormatResultsAsCSV());
            pb.IncrementProgress();

            // CheckCell weighted, all outputs, quantile
            var s_4 = new UserSimulation.Simulation();
            s_4.RunFromBatch(nboots,                                   // number of bootstraps
                                wbh.FullName,                          // Excel filename
                                significance,                          // statistical significance of threshold
                                app,                                   // Excel.Application
                                new QuantileCutoff(0.10),              // max % extreme values to flag
                                c,                                     // classification data
                                r,                                     // random number generator
                                UserSimulation.AnalysisType.CheckCell10,// analysis type
                                true,                                  // weighted analysis
                                true,                                  // use all outputs for analysis
                                prepdata.dag,                                 // AnalysisData
                                wbh,                                   // Excel.Workbook
                                errors,                                // pre-generated errors
                                prepdata.terminal_input_nodes,                  // input range nodes
                                prepdata.terminal_formula_nodes,                // output nodes
                                prepdata.original_inputs,                       // original input values
                                prepdata.correct_outputs,                       // original output values
                                max_duration_in_ms,                    // max duration of simulation
                                logfile);
            System.IO.File.AppendAllText(outfile, s_4.FormatResultsAsCSV());
            pb.IncrementProgress();

            // Normal, all inputs
            var s_2 = new UserSimulation.Simulation();
            s_2.RunFromBatch(nboots,                                   // irrelevant
                                wbh.FullName,                              // Excel filename
                                significance,                          // normal cutoff?
                                app,                                   // Excel.Application
                                new NormalCutoff(threshold),           // ??
                                c,                                     // classification data
                                r,                                     // random number generator
                                UserSimulation.AnalysisType.NormalAllInputs,   // analysis type
                                true,                                  // irrelevant
                                true,                                  // irrelevant
                                prepdata.dag,                                 // AnalysisData
                                wbh,                                   // Excel.Workbook
                                errors,                                // pre-generated errors
                                prepdata.terminal_input_nodes,                  // input range nodes
                                prepdata.terminal_formula_nodes,                // output nodes
                                prepdata.original_inputs,                       // original input values
                                prepdata.correct_outputs,                       // original output values
                                max_duration_in_ms,                    // max duration of simulation
                                logfile);
            System.IO.File.AppendAllText(outfile, s_2.FormatResultsAsCSV());
            pb.IncrementProgress();

            // Normal, range inputs
            //var s_3 = new UserSimulation.Simulation();
            //s_3.RunFromBatch(nboots,                                   // irrelevant
            //                    wbh.FullName,                              // Excel filename
            //                    significance,                          // normal cutoff?
            //                    app,                                   // Excel.Application
            //                    new NormalCutoff(threshold),           // ??
            //                    c,                                     // classification data
            //                    r,                                     // random number generator
            //                    UserSimulation.AnalysisType.NormalPerRange,   // analysis type
            //                    true,                                  // irrelevant
            //                    true,                                  // irrelevant
            //                    prepdata.graph,                                 // AnalysisData
            //                    wbh,                                   // Excel.Workbook
            //                    errors,                                // pre-generated errors
            //                    prepdata.terminal_input_nodes,                  // input range nodes
            //                    prepdata.terminal_formula_nodes,                // output nodes
            //                    prepdata.original_inputs,                       // original input values
            //                    prepdata.correct_outputs,                       // original output values
            //                    max_duration_in_ms,                    // max duration of simulation
            //                    logfile);
            //System.IO.File.AppendAllText(outfile, s_3.FormatResultsAsCSV());
            pb.IncrementProgress();
        }
Example #5
0
        public static bool RunSubletyExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // record intitial state of spreadsheet
            var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // init error generator
            var eg = new ErrorGenerator();

            // get inputs as an array of addresses to facilitate random selection
            // DATA INPUTS ONLY
            AST.Address[] inputs = prepdata.dag.terminalInputCells();

            for (int i = 0; i < 100; i++)
            {
                // randomly choose a *numeric* input
                // TODO: use Fischer-Yates and take values until
                // either we have a satisfactory input value or none
                // remain
                var rnd_addrs = inputs.Shuffle().ToList();
                bool num_found = false;
                String input_string;
                double input_value;
                AST.Address rand_addr;
                do
                {
                    // randomly choose an address; if there are none left, fail
                    if (rnd_addrs.Count == 0) {
                        return false;
                    }
                    rand_addr = rnd_addrs.First();
                    rnd_addrs = rnd_addrs.Skip(1).ToList();

                    // get the value
                    input_string = prepdata.original_inputs[rand_addr];

                    // try parsing it
                    if (Double.TryParse(input_string, out input_value))
                    {
                        num_found = true;
                    }
                } while (!num_found);

                // perturb it
                String erroneous_input = eg.GenerateSubtleErrorString(input_value, c);

                // create an error dictionary with this one perturbed value
                var errors = new CellDict();
                errors.Add(rand_addr, erroneous_input);

                // run simulations; simulation code does insertion of errors and restore of originals
                RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
            }

            return true;
        }
Example #6
0
        public static void RunSimulationPaperMain(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // record intitial state of spreadsheet
            var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // generate errors
            CellDict errors = UserSimulation.Utility.GenImportantErrors(prepdata.terminal_formula_nodes,
                                                               prepdata.original_inputs,
                                                               5,
                                                               prepdata.correct_outputs,
                                                               app,
                                                               wbh,
                                                               c,
                                                               prepdata.dag);
            // run paper simulations
            RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
        }
Example #7
0
        // num_bootstraps: the number of bootstrap samples to get
        // inputs: a list of inputs; each TreeNode represents an entire input range
        // outputs: a list of outputs; each TreeNode represents a function
        public static TreeScore DataDebug(int num_bootstraps,
            DAG dag,
            Excel.Application app,
            bool weighted,
            bool all_outputs,
            long max_duration_in_ms,
            Stopwatch sw,
            double significance,
            ProgBar pb)
        {
            // this modifies the weights of each node
            PropagateWeights(dag);

            // filter out non-terminal functions
            var output_fns = dag.terminalFormulaNodes(all_outputs);
            // filter out non-terminal inputs
            var input_rngs = dag.terminalInputVectors();

            // first idx: the index of the TreeNode in the "inputs" array
            // second idx: the ith bootstrap
            var resamples = new InputSample[input_rngs.Length][];

            // RNG for sampling
            var rng = new Random();

            // we save initial inputs and outputs here
            var initial_inputs = StoreInputs(input_rngs, dag);
            var initial_outputs = StoreOutputs(output_fns, dag);

            // Set progress bar max
            pb.setMax(input_rngs.Length * 2);

            #region RESAMPLE

            // populate bootstrap array
            // for each input range (a TreeNode)
            for (int i = 0; i < input_rngs.Length; i++)
            {
                // this TreeNode
                var t = input_rngs[i];

                // resample
                resamples[i] = Resample(num_bootstraps, initial_inputs[t], rng);

                // update progress bar
                pb.IncrementProgress();
            }

            #endregion RESAMPLE

            #region INFERENCE
            return Inference(
                num_bootstraps,
                resamples,
                initial_inputs,
                initial_outputs,
                input_rngs,
                output_fns,
                dag,
                weighted,
                significance,
                pb);
            #endregion INFERENCE
        }
Example #8
0
        public static TreeScore Inference(
            int num_bootstraps,
            InputSample[][] resamples,
            Dictionary<AST.Range, InputSample> initial_inputs,
            Dictionary<AST.Address, string> initial_outputs,
            AST.Range[] input_arr,
            AST.Address[] output_arr,
            DAG dag,
            bool weighted,
            double significance,
            ProgBar pb)
        {
            // synchronization token
            object lock_token = new Object();

            // init thread event notification array
            var mres = new ManualResetEvent[input_arr.Length];

            // init job storage
            var ddjs = new DataDebugJob[input_arr.Length];

            // init started jobs count
            var sjobs = 0;

            // init completed jobs count
            var cjobs = 0;

            // last-ditch effort flag
            bool last_try = false;

            // init score storage
            var scores = new TreeScore();

            for (int i = 0; i < input_arr.Length; i++)
            {
                try
                {
                    #region BOOTSTRAP
                    // bootstrapping is done in the parent STA thread because
                    // the .NET threading model prohibits thread pools (which
                    // are MTA) from accessing STA COM objects directly.

                    // alloc bootstrap storage for each output (f), for each resample (b)
                    FunctionOutput<string>[][] bs = new FunctionOutput<string>[initial_outputs.Count][];
                    for (int f = 0; f < initial_outputs.Count; f++)
                    {
                        bs[f] = new FunctionOutput<string>[num_bootstraps];
                    }

                    // init memoization table for input vector i
                    var memo = new BootMemo();

                    // fetch the input range TreeNode
                    var input = input_arr[i];

                    // fetch the input range COM object
                    var com = dag.getCOMRefForRange(input).Range;

                    // compute outputs
                    // replace the values of the COM object with the jth bootstrap,
                    // save all function outputs, and
                    // restore the original input
                    for (var b = 0; b < num_bootstraps; b++)
                    {
                        // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                        FunctionOutput<string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                        for (var f = 0; f < output_arr.Length; f++)
                        {
                            bs[f][b] = fos[f];
                        }
                    }

                    // restore the original inputs; faster to do once, after bootstrapping is done
                    BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

                    // TODO: restore formulas if it turns out that they were overwrittern
                    //       this should never be the case
                    #endregion BOOTSTRAP

                    #region HYPOTHESIS_TEST
                    // cancellation token
                    mres[i] = new ManualResetEvent(false);

                    // set up job
                    ddjs[i] = new DataDebugJob(
                                dag,
                                bs,
                                initial_outputs,
                                input_arr[i],
                                output_arr,
                                weighted,
                                significance,
                                mres[i]
                                );

                    sjobs++;

                    // hand job to thread pool
                    ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
                    #endregion HYPOTHESIS_TEST

                    // update progress bar
                    pb.IncrementProgress();
                }
                catch (System.OutOfMemoryException e)
                {
                    if (!last_try)
                    {
                        // If there are no more jobs running, but
                        // we still can't allocate memory, try invoking
                        // GC and then trying again
                        cjobs = mres.Count(mre => mre.WaitOne(0));
                        if (sjobs - cjobs == 0)
                        {
                            GC.Collect();
                            last_try = true;
                        }
                    }
                    else
                    {
                        // we just don't have enough memory
                        throw e;
                    }

                    // wait for any of the 0..i-1 work items
                    // to complete and try again
                    WaitHandle.WaitAny(mres.Take(i).ToArray());
                }
            }

            // Do not proceed until all hypothesis tests are done.
            // WaitHandle.WaitAll cannot be called on an STA thread which
            // is why we call WaitOne in a loop.
            // Merge scores as data becomes available.
            for (int i = 0; i < input_arr.Length; i++)
            {
                mres[i].WaitOne();
                scores = DictAdd(scores, ddjs[i].Result);
            }

            return scores;
        }
Example #9
0
        public static TreeScore Inference(
            int num_bootstraps,
            InputSample[][] resamples,
            Dictionary <AST.Range, InputSample> initial_inputs,
            Dictionary <AST.Address, string> initial_outputs,
            AST.Range[] input_arr,
            AST.Address[] output_arr,
            DAG dag,
            bool weighted,
            double significance,
            ProgBar pb)
        {
            // synchronization token
            object lock_token = new Object();

            // init thread event notification array
            var mres = new ManualResetEvent[input_arr.Length];

            // init job storage
            var ddjs = new DataDebugJob[input_arr.Length];

            // init started jobs count
            var sjobs = 0;

            // init completed jobs count
            var cjobs = 0;

            // last-ditch effort flag
            bool last_try = false;

            // init score storage
            var scores = new TreeScore();

            for (int i = 0; i < input_arr.Length; i++)
            {
                try
                {
                    #region BOOTSTRAP
                    // bootstrapping is done in the parent STA thread because
                    // the .NET threading model prohibits thread pools (which
                    // are MTA) from accessing STA COM objects directly.

                    // alloc bootstrap storage for each output (f), for each resample (b)
                    FunctionOutput <string>[][] bs = new FunctionOutput <string> [initial_outputs.Count][];
                    for (int f = 0; f < initial_outputs.Count; f++)
                    {
                        bs[f] = new FunctionOutput <string> [num_bootstraps];
                    }

                    // init memoization table for input vector i
                    var memo = new BootMemo();

                    // fetch the input range TreeNode
                    var input = input_arr[i];

                    // fetch the input range COM object
                    var com = dag.getCOMRefForRange(input).Range;

                    // compute outputs
                    // replace the values of the COM object with the jth bootstrap,
                    // save all function outputs, and
                    // restore the original input
                    for (var b = 0; b < num_bootstraps; b++)
                    {
                        // lookup outputs from memo table; otherwise do replacement, compute outputs, store them in table, and return them
                        FunctionOutput <string>[] fos = memo.FastReplace(com, dag, initial_inputs[input], resamples[i][b], output_arr, false);
                        for (var f = 0; f < output_arr.Length; f++)
                        {
                            bs[f][b] = fos[f];
                        }
                    }

                    // restore the original inputs; faster to do once, after bootstrapping is done
                    BootMemo.ReplaceExcelRange(com, initial_inputs[input]);

                    // TODO: restore formulas if it turns out that they were overwrittern
                    //       this should never be the case
                    #endregion BOOTSTRAP

                    #region HYPOTHESIS_TEST
                    // cancellation token
                    mres[i] = new ManualResetEvent(false);

                    // set up job
                    ddjs[i] = new DataDebugJob(
                        dag,
                        bs,
                        initial_outputs,
                        input_arr[i],
                        output_arr,
                        weighted,
                        significance,
                        mres[i]
                        );

                    sjobs++;

                    // hand job to thread pool
                    ThreadPool.QueueUserWorkItem(ddjs[i].threadPoolCallback, i);
                    #endregion HYPOTHESIS_TEST

                    // update progress bar
                    pb.IncrementProgress();
                }
                catch (System.OutOfMemoryException e)
                {
                    if (!last_try)
                    {
                        // If there are no more jobs running, but
                        // we still can't allocate memory, try invoking
                        // GC and then trying again
                        cjobs = mres.Count(mre => mre.WaitOne(0));
                        if (sjobs - cjobs == 0)
                        {
                            GC.Collect();
                            last_try = true;
                        }
                    }
                    else
                    {
                        // we just don't have enough memory
                        throw e;
                    }

                    // wait for any of the 0..i-1 work items
                    // to complete and try again
                    WaitHandle.WaitAny(mres.Take(i).ToArray());
                }
            }

            // Do not proceed until all hypothesis tests are done.
            // WaitHandle.WaitAll cannot be called on an STA thread which
            // is why we call WaitOne in a loop.
            // Merge scores as data becomes available.
            for (int i = 0; i < input_arr.Length; i++)
            {
                mres[i].WaitOne();
                scores = DictAdd(scores, ddjs[i].Result);
            }

            return(scores);
        }
Example #10
0
        // num_bootstraps: the number of bootstrap samples to get
        // inputs: a list of inputs; each TreeNode represents an entire input range
        // outputs: a list of outputs; each TreeNode represents a function
        public static TreeScore DataDebug(int num_bootstraps,
                                          DAG dag,
                                          Excel.Application app,
                                          bool weighted,
                                          bool all_outputs,
                                          long max_duration_in_ms,
                                          Stopwatch sw,
                                          double significance,
                                          ProgBar pb)
        {
            // this modifies the weights of each node
            PropagateWeights(dag);

            // filter out non-terminal functions
            var output_fns = dag.terminalFormulaNodes(all_outputs);
            // filter out non-terminal inputs
            var input_rngs = dag.terminalInputVectors();

            // first idx: the index of the TreeNode in the "inputs" array
            // second idx: the ith bootstrap
            var resamples = new InputSample[input_rngs.Length][];

            // RNG for sampling
            var rng = new Random();

            // we save initial inputs and outputs here
            var initial_inputs  = StoreInputs(input_rngs, dag);
            var initial_outputs = StoreOutputs(output_fns, dag);

            // Set progress bar max
            pb.setMax(input_rngs.Length * 2);

            #region RESAMPLE

            // populate bootstrap array
            // for each input range (a TreeNode)
            for (int i = 0; i < input_rngs.Length; i++)
            {
                // this TreeNode
                var t = input_rngs[i];

                // resample
                resamples[i] = Resample(num_bootstraps, initial_inputs[t], rng);

                // update progress bar
                pb.IncrementProgress();
            }

            #endregion RESAMPLE

            #region INFERENCE
            return(Inference(
                       num_bootstraps,
                       resamples,
                       initial_inputs,
                       initial_outputs,
                       input_rngs,
                       output_fns,
                       dag,
                       weighted,
                       significance,
                       pb));

            #endregion INFERENCE
        }
Example #11
0
        public void Analyze(long max_duration_in_ms)
        {
            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            using (var pb = new ProgBar())
            {
                // Disable screen updating during analysis to speed things up
                _app.ScreenUpdating = false;

                // Build dependency graph (modifies data)
                try
                {
                    _dag = new DAG(_app.ActiveWorkbook, _app, IGNORE_PARSE_ERRORS);
                    var num_input_cells = _dag.numberOfInputCells();
                }
                catch (ExcelParserUtility.ParseException e)
                {
                    // cleanup UI and then rethrow
                    _app.ScreenUpdating = true;
                    throw e;
                }

                if (_dag.terminalInputVectors().Length == 0)
                {
                    System.Windows.Forms.MessageBox.Show("This spreadsheet contains no vector-input functions.");
                    _app.ScreenUpdating = true;
                    _flaggable = new KeyValuePair<AST.Address, int>[0];
                    return;
                }

                // Get bootstraps
                var scores = Analysis.DataDebug(NBOOTS,
                                                _dag,
                                                _app,
                                                weighted: USE_WEIGHTS,
                                                all_outputs: CONSIDER_ALL_OUTPUTS,
                                                max_duration_in_ms: max_duration_in_ms,
                                                sw: sw,
                                                significance: _tool_significance,
                                                pb: pb)
                                     .OrderByDescending(pair => pair.Value).ToArray();

                if (_debug_mode)
                {
                    var score_str = String.Join("\n", scores.Take(10).Select(score => score.Key.A1FullyQualified() + " -> " + score.Value.ToString()));
                    System.Windows.Forms.MessageBox.Show(score_str);
                    System.Windows.Forms.Clipboard.SetText(score_str);
                }

                List<KeyValuePair<AST.Address, int>> high_scores = new List<KeyValuePair<AST.Address, int>>();

                // calculate cutoff idnex
                int thresh = scores.Length - Convert.ToInt32(scores.Length * _tool_significance);

                // filter out cells that are...
                _flaggable = scores.Where(pair => pair.Value >= scores[thresh].Value)   // below threshold
                                   .Where(pair => !_known_good.Contains(pair.Key))      // known to be good
                                   .Where(pair => pair.Value != 0).ToArray();           // score == 0

                // Enable screen updating when we're done
                _app.ScreenUpdating = true;

                sw.Stop();
            }
        }
Example #12
0
        private static void RunSubletyExperiment(Excel.Application app, Excel.Workbook wb, Random rng, UserSimulation.Classification c, string output_dir, double thresh, ProgBar pb)
        {
            // number of bootstraps
            var NBOOTS = 2700;

            // the full path of this workbook
            var filename = app.ActiveWorkbook.Name;

            // the default output filename
            var r = new System.Text.RegularExpressions.Regex(@"(.+)\.xls|xlsx", System.Text.RegularExpressions.RegexOptions.Compiled);
            var default_output_file = "simulation_results.csv";
            var default_log_file = r.Match(filename).Groups[1].Value + ".iterlog.csv";

            // save file location (will append for additional runs)
            var savefile = System.IO.Path.Combine(output_dir, default_output_file);

            // log file location (new file for each new workbook)
            var logfile = System.IO.Path.Combine(output_dir, default_log_file);

            // disable screen updating
            app.ScreenUpdating = false;

            // run simulations
            if (!UserSimulation.Config.RunSubletyExperiment(app, wb, NBOOTS, 0.95, thresh, c, rng, savefile, MAX_DURATION_IN_MS, logfile, pb, IGNORE_PARSE_ERRORS))
            {
                System.Windows.Forms.MessageBox.Show("This spreadsheet contains no numeric inputs.");
            }

            // enable screen updating
            app.ScreenUpdating = true;
        }
Example #13
0
        // remove errors until none remain
        private UserResults SimulateUser(int nboots,
            double significance,
            CutoffKind ck,
            DAG dag,
            CellDict original_inputs,
            CellDict errord,
            CellDict correct_outputs,
            Excel.Workbook wb,
            Excel.Application app,
            AnalysisType analysis_type,
            bool weighted,
            bool all_outputs,
            long max_duration_in_ms,
            Stopwatch sw,
            String logfile,
            ProgBar pb
            )
        {
            // init user results data structure
            var o = new UserResults();
            HashSet<AST.Address> known_good = new HashSet<AST.Address>();

            // initialize procedure
            var errors_remain = true;
            var max_errors = new ErrorDict();
            var incorrect_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            var errors_found = 0;
            var number_of_true_errors = errord.Count;
            Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

            // the corrected state of the spreadsheet
            CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

            // remove errors loop
            var cells_inspected = 0;
            List<KeyValuePair<AST.Address, int>> filtered_high_scores = null;
            bool correction_made = true;
            while (errors_remain)
            {
                Console.Write(".");

                AST.Address flagged_cell = null;

                // choose the appropriate test
                if (analysis_type == AnalysisType.CheckCell5 ||
                    analysis_type == AnalysisType.CheckCell10
                    )

                {
                    flagged_cell = SimulationStep.CheckCell_Step(o,
                                                  significance,
                                                  ck,
                                                  nboots,
                                                  dag,
                                                  app,
                                                  weighted,
                                                  all_outputs,
                                                  correction_made,
                                                  known_good,
                                                  ref filtered_high_scores,
                                                  max_duration_in_ms,
                                                  sw,
                                                  pb);
                } else if (analysis_type == AnalysisType.NormalPerRange)
                {
                    flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
                }
                else if (analysis_type == AnalysisType.NormalAllInputs)
                {
                    flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
                }

                // stop if the test no longer returns anything or if
                // the test is simply done inspecting based on a fixed threshold
                if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
                {
                    errors_remain = false;
                }
                else    // a cell was flagged
                {
                    //cells_inspected should only be incremented when a cell is actually flagged. If nothing is flagged,
                    //then nothing is inspected, so cells_inspected doesn't increase.
                    cells_inspected += 1;

                    // check to see if the flagged value is actually an error
                    if (errord.ContainsKey(flagged_cell))
                    {
                        correction_made = true;
                        errors_found += 1;
                        // P(k) * rel(k)
                        o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                        o.true_positives.Add(flagged_cell);

                        // correct flagged cell
                        flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                        Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                        // compute total error after applying this correction
                        var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                        o.current_total_error.Add(current_total_error);

                        // save outputs
                        partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
                    }
                    else
                    {
                        correction_made = false;
                        // numerator is 0 here because rel(k) = 0 when no error was found
                        o.PrecRel_at_k.Add(0.0);
                        o.false_positives.Add(flagged_cell);
                    }

                    // mark it as known good -- at this point the cell has been
                    //      'inspected' regardless of whether it was an error
                    //      It was either corrected or marked as OK
                    known_good.Add(flagged_cell);

                    // compute output error magnitudes
                    var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);
                    // compute input error magnitude
                    double num_input_error_magnitude;
                    double str_input_error_magnitude;
                    if (errord.ContainsKey(flagged_cell))
                    {
                        if (Utility.BothNumbers(errord[flagged_cell], original_inputs[flagged_cell]))
                        {
                            num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[flagged_cell]), Double.Parse(original_inputs[flagged_cell]));
                            str_input_error_magnitude = 0;
                        }
                        else
                        {
                            num_input_error_magnitude = 0;
                            str_input_error_magnitude = Utility.StringMagnitudeChange(errord[flagged_cell], original_inputs[flagged_cell]);
                        }
                    }
                    else
                    {
                        num_input_error_magnitude = 0;
                        str_input_error_magnitude = 0;
                    }

                    // write error log
                    var logentry = new LogEntry(analysis_type,
                                                wb.Name,
                                                flagged_cell,
                                                original_inputs[flagged_cell],
                                                errord.ContainsKey(flagged_cell) ? errord[flagged_cell] : original_inputs[flagged_cell],
                                                output_error_magnitude,
                                                num_input_error_magnitude,
                                                str_input_error_magnitude,
                                                true,
                                                correction_made,
                                                significance,
                                                ck.Threshold);
                    logentry.WriteLog(logfile);
                    _error_log.Add(logentry);
                }
            }

            // find all of the false negatives
            o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
            o.max_errors = max_errors;

            var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // write out all false negative information
            foreach (AST.Address fn in o.false_negatives)
            {
                double num_input_error_magnitude;
                double str_input_error_magnitude;
                if (Utility.BothNumbers(errord[fn], original_inputs[fn]))
                {
                    num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[fn]), Double.Parse(original_inputs[fn]));
                    str_input_error_magnitude = 0;
                }
                else
                {
                    num_input_error_magnitude = 0;
                    str_input_error_magnitude = Utility.StringMagnitudeChange(errord[fn], original_inputs[fn]);
                }

                // write error log
                _error_log.Add(new LogEntry(analysis_type,
                                            wb.Name,
                                            fn,
                                            original_inputs[fn],
                                            errord[fn],
                                            last_out_err_mag,
                                            num_input_error_magnitude,
                                            str_input_error_magnitude,
                                            false,
                                            true,
                                            significance,
                                            ck.Threshold));
            }
            return o;
        }
Example #14
0
        // returns the number of cells inspected
        public int Run(int nboots,                 // number of bootstraps
            string xlfile,              // name of the workbook
            double significance,        // significance threshold for test
            CutoffKind ck,              // kind of threshold function to use
            Excel.Application app,      // reference to Excel app
            Classification c,           // data from which to generate errors
            Random r,                   // a random number generator
            AnalysisType analysisType,  // the type of analysis to run
            bool weighted,              // should we weigh things?
            bool all_outputs,           // if !all_outputs, we only consider terminal outputs
            DAG dag,
            Excel.Workbook wb,
            AST.Address[] terminal_formula_cells,
            AST.Range[] terminal_input_vectors,
            CellDict original_inputs,
            CellDict correct_outputs,
            long max_duration_in_ms,
            String logfile,              //filename for the output log
            ProgBar pb
            )
        {
            //set wbname and path
            _wb_name = xlfile;
            _wb_path = wb.Path;
            _analysis_type = analysisType;
            _significance = significance;
            _all_outputs = all_outputs;
            _weighted = weighted;

            //Now we want to inject the errors from _errors
            Utility.InjectValues(app, wb, _errors);

            // save function outputs
            CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            //Time the removal of errors
            Stopwatch sw = new Stopwatch();
            sw.Start();

            // remove errors until none remain; MODIFIES WORKBOOK
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, sw, logfile, pb);

            sw.Stop();
            TimeSpan elapsed = sw.Elapsed;
            _analysis_time = elapsed.TotalSeconds;

            // save partially-corrected outputs
            var partially_corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // compute total relative error
            _error = Utility.CalculateNormalizedError(correct_outputs, partially_corrected_outputs, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // compute starting total relative error (normalized by max_errors)
            ErrorDict starting_error = Utility.CalculateNormalizedError(correct_outputs, incorrect_outputs, _user.max_errors);
            _initial_total_relative_error = Utility.TotalRelativeError(starting_error);

            // effort
            _max_effort = dag.allCells().Length;
            _effort = (_user.true_positives.Count + _user.false_positives.Count);
            _expended_effort = (double)_effort / (double)_max_effort;

            // compute average precision
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k,
            // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // restore original values
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;
            // flag that we're done; safe to print output results
            _simulation_run = true;

            // return the number of cells inspected
            return _effort;
        }