Beispiel #1
0
        // this function returns an address but also updates
        // the filtered_high_scores list
        public static AST.Address CheckCell_Step(UserResults o,
                                                 double significance,
                                                 CutoffKind ck,
                                                 int nboots,
                                                 DAG dag,
                                                 Excel.Application app,
                                                 bool weighted,
                                                 bool all_outputs,
                                                 bool run_bootstrap,
                                                 HashSet <AST.Address> known_good,
                                                 ref List <KeyValuePair <AST.Address, int> > filtered_high_scores,
                                                 long max_duration_in_ms,
                                                 Stopwatch sw,
                                                 ProgBar pb)
        {
            // Get bootstraps
            // The bootstrap should only re-run if there is a correction made,
            //      not when something is marked as OK (isn't one of the introduced errors)
            // The list of suspected cells doesn't change when we mark something as OK,
            //      we just move on to the next thing in the list
            if (run_bootstrap)
            {
                TreeScore scores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

                // apply a threshold to the scores
                filtered_high_scores = ck.applyCutoff(scores, known_good);
            }
            else  //if no corrections were made (a cell was marked as OK, not corrected)
            {
                //re-filter out cells marked as OK
                filtered_high_scores = filtered_high_scores.Where(kvp => !known_good.Contains(kvp.Key)).ToList();
            }

            if (filtered_high_scores.Count() != 0)
            {
                // get AST.Address corresponding to most unusual score
                return(filtered_high_scores[0].Key);
            }
            else
            {
                return(null);
            }
        }
        // this function returns an address but also updates
        // the filtered_high_scores list
        public static AST.Address CheckCell_Step(UserResults o,
            double significance,
            CutoffKind ck,
            int nboots,
            DAG dag,
            Excel.Application app,
            bool weighted,
            bool all_outputs,
            bool run_bootstrap,
            HashSet<AST.Address> known_good,
            ref List<KeyValuePair<AST.Address, int>> filtered_high_scores,
            long max_duration_in_ms,
            Stopwatch sw,
            ProgBar pb)
        {
            // Get bootstraps
            // The bootstrap should only re-run if there is a correction made,
            //      not when something is marked as OK (isn't one of the introduced errors)
            // The list of suspected cells doesn't change when we mark something as OK,
            //      we just move on to the next thing in the list
            if (run_bootstrap)
            {
                TreeScore scores = Analysis.DataDebug(nboots, dag, app, weighted, all_outputs, max_duration_in_ms, sw, significance, pb);

                // apply a threshold to the scores
                filtered_high_scores = ck.applyCutoff(scores, known_good);
            }
            else  //if no corrections were made (a cell was marked as OK, not corrected)
            {
                //re-filter out cells marked as OK
                filtered_high_scores = filtered_high_scores.Where(kvp => !known_good.Contains(kvp.Key)).ToList();
            }

            if (filtered_high_scores.Count() != 0)
            {
                // get AST.Address corresponding to most unusual score
                return filtered_high_scores[0].Key;
            }
            else
            {
                return null;
            }
        }
Beispiel #3
0
        // remove errors until none remain
        private UserResults SimulateUser(int nboots,
                                         double significance,
                                         CutoffKind ck,
                                         DAG dag,
                                         CellDict original_inputs,
                                         CellDict errord,
                                         CellDict correct_outputs,
                                         Excel.Workbook wb,
                                         Excel.Application app,
                                         AnalysisType analysis_type,
                                         bool weighted,
                                         bool all_outputs,
                                         long max_duration_in_ms,
                                         Stopwatch sw,
                                         String logfile,
                                         ProgBar pb
                                         )
        {
            // init user results data structure
            var o = new UserResults();
            HashSet <AST.Address> known_good = new HashSet <AST.Address>();

            // initialize procedure
            var errors_remain         = true;
            var max_errors            = new ErrorDict();
            var incorrect_outputs     = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            var errors_found          = 0;
            var number_of_true_errors = errord.Count;

            Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

            // the corrected state of the spreadsheet
            CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

            // remove errors loop
            var cells_inspected = 0;
            List <KeyValuePair <AST.Address, int> > filtered_high_scores = null;
            bool correction_made = true;

            while (errors_remain)
            {
                Console.Write(".");

                AST.Address flagged_cell = null;

                // choose the appropriate test
                if (analysis_type == AnalysisType.CheckCell5 ||
                    analysis_type == AnalysisType.CheckCell10
                    )

                {
                    flagged_cell = SimulationStep.CheckCell_Step(o,
                                                                 significance,
                                                                 ck,
                                                                 nboots,
                                                                 dag,
                                                                 app,
                                                                 weighted,
                                                                 all_outputs,
                                                                 correction_made,
                                                                 known_good,
                                                                 ref filtered_high_scores,
                                                                 max_duration_in_ms,
                                                                 sw,
                                                                 pb);
                }
                else if (analysis_type == AnalysisType.NormalPerRange)
                {
                    flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
                }
                else if (analysis_type == AnalysisType.NormalAllInputs)
                {
                    flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
                }

                // stop if the test no longer returns anything or if
                // the test is simply done inspecting based on a fixed threshold
                if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
                {
                    errors_remain = false;
                }
                else    // a cell was flagged
                {
                    //cells_inspected should only be incremented when a cell is actually flagged. If nothing is flagged,
                    //then nothing is inspected, so cells_inspected doesn't increase.
                    cells_inspected += 1;

                    // check to see if the flagged value is actually an error
                    if (errord.ContainsKey(flagged_cell))
                    {
                        correction_made = true;
                        errors_found   += 1;
                        // P(k) * rel(k)
                        o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                        o.true_positives.Add(flagged_cell);

                        // correct flagged cell
                        flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                        Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                        // compute total error after applying this correction
                        var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                        o.current_total_error.Add(current_total_error);

                        // save outputs
                        partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
                    }
                    else
                    {
                        correction_made = false;
                        // numerator is 0 here because rel(k) = 0 when no error was found
                        o.PrecRel_at_k.Add(0.0);
                        o.false_positives.Add(flagged_cell);
                    }

                    // mark it as known good -- at this point the cell has been
                    //      'inspected' regardless of whether it was an error
                    //      It was either corrected or marked as OK
                    known_good.Add(flagged_cell);

                    // compute output error magnitudes
                    var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);
                    // compute input error magnitude
                    double num_input_error_magnitude;
                    double str_input_error_magnitude;
                    if (errord.ContainsKey(flagged_cell))
                    {
                        if (Utility.BothNumbers(errord[flagged_cell], original_inputs[flagged_cell]))
                        {
                            num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[flagged_cell]), Double.Parse(original_inputs[flagged_cell]));
                            str_input_error_magnitude = 0;
                        }
                        else
                        {
                            num_input_error_magnitude = 0;
                            str_input_error_magnitude = Utility.StringMagnitudeChange(errord[flagged_cell], original_inputs[flagged_cell]);
                        }
                    }
                    else
                    {
                        num_input_error_magnitude = 0;
                        str_input_error_magnitude = 0;
                    }

                    // write error log
                    var logentry = new LogEntry(analysis_type,
                                                wb.Name,
                                                flagged_cell,
                                                original_inputs[flagged_cell],
                                                errord.ContainsKey(flagged_cell) ? errord[flagged_cell] : original_inputs[flagged_cell],
                                                output_error_magnitude,
                                                num_input_error_magnitude,
                                                str_input_error_magnitude,
                                                true,
                                                correction_made,
                                                significance,
                                                ck.Threshold);
                    logentry.WriteLog(logfile);
                    _error_log.Add(logentry);
                }
            }

            // find all of the false negatives
            o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
            o.max_errors      = max_errors;

            var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // write out all false negative information
            foreach (AST.Address fn in o.false_negatives)
            {
                double num_input_error_magnitude;
                double str_input_error_magnitude;
                if (Utility.BothNumbers(errord[fn], original_inputs[fn]))
                {
                    num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[fn]), Double.Parse(original_inputs[fn]));
                    str_input_error_magnitude = 0;
                }
                else
                {
                    num_input_error_magnitude = 0;
                    str_input_error_magnitude = Utility.StringMagnitudeChange(errord[fn], original_inputs[fn]);
                }

                // write error log
                _error_log.Add(new LogEntry(analysis_type,
                                            wb.Name,
                                            fn,
                                            original_inputs[fn],
                                            errord[fn],
                                            last_out_err_mag,
                                            num_input_error_magnitude,
                                            str_input_error_magnitude,
                                            false,
                                            true,
                                            significance,
                                            ck.Threshold));
            }
            return(o);
        }
Beispiel #4
0
        // returns the number of cells inspected
        public int Run(int nboots,                 // number of bootstraps
                       string xlfile,              // name of the workbook
                       double significance,        // significance threshold for test
                       CutoffKind ck,              // kind of threshold function to use
                       Excel.Application app,      // reference to Excel app
                       Classification c,           // data from which to generate errors
                       Random r,                   // a random number generator
                       AnalysisType analysisType,  // the type of analysis to run
                       bool weighted,              // should we weigh things?
                       bool all_outputs,           // if !all_outputs, we only consider terminal outputs
                       DAG dag,
                       Excel.Workbook wb,
                       AST.Address[] terminal_formula_cells,
                       AST.Range[] terminal_input_vectors,
                       CellDict original_inputs,
                       CellDict correct_outputs,
                       long max_duration_in_ms,
                       String logfile,               //filename for the output log
                       ProgBar pb
                       )
        {
            //set wbname and path
            _wb_name       = xlfile;
            _wb_path       = wb.Path;
            _analysis_type = analysisType;
            _significance  = significance;
            _all_outputs   = all_outputs;
            _weighted      = weighted;

            //Now we want to inject the errors from _errors
            Utility.InjectValues(app, wb, _errors);

            // save function outputs
            CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            //Time the removal of errors
            Stopwatch sw = new Stopwatch();

            sw.Start();

            // remove errors until none remain; MODIFIES WORKBOOK
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, sw, logfile, pb);

            sw.Stop();
            TimeSpan elapsed = sw.Elapsed;

            _analysis_time = elapsed.TotalSeconds;

            // save partially-corrected outputs
            var partially_corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // compute total relative error
            _error = Utility.CalculateNormalizedError(correct_outputs, partially_corrected_outputs, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // compute starting total relative error (normalized by max_errors)
            ErrorDict starting_error = Utility.CalculateNormalizedError(correct_outputs, incorrect_outputs, _user.max_errors);

            _initial_total_relative_error = Utility.TotalRelativeError(starting_error);

            // effort
            _max_effort      = dag.allCells().Length;
            _effort          = (_user.true_positives.Count + _user.false_positives.Count);
            _expended_effort = (double)_effort / (double)_max_effort;

            // compute average precision
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k,
            // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // restore original values
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;
            // flag that we're done; safe to print output results
            _simulation_run = true;

            // return the number of cells inspected
            return(_effort);
        }
Beispiel #5
0
        // remove errors until none remain
        private UserResults SimulateUser(int nboots,
            double significance,
            CutoffKind ck,
            DAG dag,
            CellDict original_inputs,
            CellDict errord,
            CellDict correct_outputs,
            Excel.Workbook wb,
            Excel.Application app,
            AnalysisType analysis_type,
            bool weighted,
            bool all_outputs,
            long max_duration_in_ms,
            Stopwatch sw,
            String logfile,
            ProgBar pb
            )
        {
            // init user results data structure
            var o = new UserResults();
            HashSet<AST.Address> known_good = new HashSet<AST.Address>();

            // initialize procedure
            var errors_remain = true;
            var max_errors = new ErrorDict();
            var incorrect_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
            var errors_found = 0;
            var number_of_true_errors = errord.Count;
            Utility.UpdatePerFunctionMaxError(correct_outputs, incorrect_outputs, max_errors);

            // the corrected state of the spreadsheet
            CellDict partially_corrected_outputs = correct_outputs.ToDictionary(p => p.Key, p => p.Value);

            // remove errors loop
            var cells_inspected = 0;
            List<KeyValuePair<AST.Address, int>> filtered_high_scores = null;
            bool correction_made = true;
            while (errors_remain)
            {
                Console.Write(".");

                AST.Address flagged_cell = null;

                // choose the appropriate test
                if (analysis_type == AnalysisType.CheckCell5 ||
                    analysis_type == AnalysisType.CheckCell10
                    )

                {
                    flagged_cell = SimulationStep.CheckCell_Step(o,
                                                  significance,
                                                  ck,
                                                  nboots,
                                                  dag,
                                                  app,
                                                  weighted,
                                                  all_outputs,
                                                  correction_made,
                                                  known_good,
                                                  ref filtered_high_scores,
                                                  max_duration_in_ms,
                                                  sw,
                                                  pb);
                } else if (analysis_type == AnalysisType.NormalPerRange)
                {
                    flagged_cell = SimulationStep.NormalPerRange_Step(dag, wb, known_good, max_duration_in_ms, sw);
                }
                else if (analysis_type == AnalysisType.NormalAllInputs)
                {
                    flagged_cell = SimulationStep.NormalAllOutputs_Step(dag, app, wb, known_good, max_duration_in_ms, sw);
                }

                // stop if the test no longer returns anything or if
                // the test is simply done inspecting based on a fixed threshold
                if (flagged_cell == null || (ck.isCountBased && ck.Threshold == cells_inspected))
                {
                    errors_remain = false;
                }
                else    // a cell was flagged
                {
                    //cells_inspected should only be incremented when a cell is actually flagged. If nothing is flagged,
                    //then nothing is inspected, so cells_inspected doesn't increase.
                    cells_inspected += 1;

                    // check to see if the flagged value is actually an error
                    if (errord.ContainsKey(flagged_cell))
                    {
                        correction_made = true;
                        errors_found += 1;
                        // P(k) * rel(k)
                        o.PrecRel_at_k.Add(errors_found / (double)cells_inspected);
                        o.true_positives.Add(flagged_cell);

                        // correct flagged cell
                        flagged_cell.GetCOMObject(app).Value2 = original_inputs[flagged_cell];

                        Utility.UpdatePerFunctionMaxError(correct_outputs, partially_corrected_outputs, max_errors);

                        // compute total error after applying this correction
                        var current_total_error = Utility.CalculateTotalError(correct_outputs, partially_corrected_outputs);
                        o.current_total_error.Add(current_total_error);

                        // save outputs
                        partially_corrected_outputs = Utility.SaveOutputs(dag.terminalFormulaNodes(all_outputs), dag);
                    }
                    else
                    {
                        correction_made = false;
                        // numerator is 0 here because rel(k) = 0 when no error was found
                        o.PrecRel_at_k.Add(0.0);
                        o.false_positives.Add(flagged_cell);
                    }

                    // mark it as known good -- at this point the cell has been
                    //      'inspected' regardless of whether it was an error
                    //      It was either corrected or marked as OK
                    known_good.Add(flagged_cell);

                    // compute output error magnitudes
                    var output_error_magnitude = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);
                    // compute input error magnitude
                    double num_input_error_magnitude;
                    double str_input_error_magnitude;
                    if (errord.ContainsKey(flagged_cell))
                    {
                        if (Utility.BothNumbers(errord[flagged_cell], original_inputs[flagged_cell]))
                        {
                            num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[flagged_cell]), Double.Parse(original_inputs[flagged_cell]));
                            str_input_error_magnitude = 0;
                        }
                        else
                        {
                            num_input_error_magnitude = 0;
                            str_input_error_magnitude = Utility.StringMagnitudeChange(errord[flagged_cell], original_inputs[flagged_cell]);
                        }
                    }
                    else
                    {
                        num_input_error_magnitude = 0;
                        str_input_error_magnitude = 0;
                    }

                    // write error log
                    var logentry = new LogEntry(analysis_type,
                                                wb.Name,
                                                flagged_cell,
                                                original_inputs[flagged_cell],
                                                errord.ContainsKey(flagged_cell) ? errord[flagged_cell] : original_inputs[flagged_cell],
                                                output_error_magnitude,
                                                num_input_error_magnitude,
                                                str_input_error_magnitude,
                                                true,
                                                correction_made,
                                                significance,
                                                ck.Threshold);
                    logentry.WriteLog(logfile);
                    _error_log.Add(logentry);
                }
            }

            // find all of the false negatives
            o.false_negatives = Utility.GetFalseNegatives(o.true_positives, o.false_positives, errord);
            o.max_errors = max_errors;

            var last_out_err_mag = Utility.MeanErrorMagnitude(partially_corrected_outputs, correct_outputs);

            // write out all false negative information
            foreach (AST.Address fn in o.false_negatives)
            {
                double num_input_error_magnitude;
                double str_input_error_magnitude;
                if (Utility.BothNumbers(errord[fn], original_inputs[fn]))
                {
                    num_input_error_magnitude = Utility.NumericalMagnitudeChange(Double.Parse(errord[fn]), Double.Parse(original_inputs[fn]));
                    str_input_error_magnitude = 0;
                }
                else
                {
                    num_input_error_magnitude = 0;
                    str_input_error_magnitude = Utility.StringMagnitudeChange(errord[fn], original_inputs[fn]);
                }

                // write error log
                _error_log.Add(new LogEntry(analysis_type,
                                            wb.Name,
                                            fn,
                                            original_inputs[fn],
                                            errord[fn],
                                            last_out_err_mag,
                                            num_input_error_magnitude,
                                            str_input_error_magnitude,
                                            false,
                                            true,
                                            significance,
                                            ck.Threshold));
            }
            return o;
        }
Beispiel #6
0
        // returns the number of cells inspected
        public int Run(int nboots,                 // number of bootstraps
            string xlfile,              // name of the workbook
            double significance,        // significance threshold for test
            CutoffKind ck,              // kind of threshold function to use
            Excel.Application app,      // reference to Excel app
            Classification c,           // data from which to generate errors
            Random r,                   // a random number generator
            AnalysisType analysisType,  // the type of analysis to run
            bool weighted,              // should we weigh things?
            bool all_outputs,           // if !all_outputs, we only consider terminal outputs
            DAG dag,
            Excel.Workbook wb,
            AST.Address[] terminal_formula_cells,
            AST.Range[] terminal_input_vectors,
            CellDict original_inputs,
            CellDict correct_outputs,
            long max_duration_in_ms,
            String logfile,              //filename for the output log
            ProgBar pb
            )
        {
            //set wbname and path
            _wb_name = xlfile;
            _wb_path = wb.Path;
            _analysis_type = analysisType;
            _significance = significance;
            _all_outputs = all_outputs;
            _weighted = weighted;

            //Now we want to inject the errors from _errors
            Utility.InjectValues(app, wb, _errors);

            // save function outputs
            CellDict incorrect_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            //Time the removal of errors
            Stopwatch sw = new Stopwatch();
            sw.Start();

            // remove errors until none remain; MODIFIES WORKBOOK
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, sw, logfile, pb);

            sw.Stop();
            TimeSpan elapsed = sw.Elapsed;
            _analysis_time = elapsed.TotalSeconds;

            // save partially-corrected outputs
            var partially_corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // compute total relative error
            _error = Utility.CalculateNormalizedError(correct_outputs, partially_corrected_outputs, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // compute starting total relative error (normalized by max_errors)
            ErrorDict starting_error = Utility.CalculateNormalizedError(correct_outputs, incorrect_outputs, _user.max_errors);
            _initial_total_relative_error = Utility.TotalRelativeError(starting_error);

            // effort
            _max_effort = dag.allCells().Length;
            _effort = (_user.true_positives.Count + _user.false_positives.Count);
            _expended_effort = (double)_effort / (double)_max_effort;

            // compute average precision
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k,
            // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // restore original values
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;
            // flag that we're done; safe to print output results
            _simulation_run = true;

            // return the number of cells inspected
            return _effort;
        }