コード例 #1
0
 // Feeds every (original, entered) string pair from data into a new
 // Classification, printing a progress percentage to the console as it goes,
 // then serializes the classification to serfile and returns it.
 public static Classification Classify(MTurkParser.Data data, string serfile)
 {
     var expected_count = data.NumInputs;
     var classification = new Classification();
     var pairs = data.StringPairs.ToArray();
     int position = 0;
     foreach (var pair in pairs)
     {
         var original = pair.Item1;
         var entered = pair.Item2;
         // progress is the share of pairs handled before this one, relative to NumInputs
         double fraction = System.Convert.ToDouble(position) / System.Convert.ToDouble(expected_count);
         Console.Write("\r{0:P} strings classified", fraction);
         classification.ProcessTypos(original, entered);
         position++;
     }
     Console.Write("\n");
     classification.Serialize(serfile);
     return classification;
 }
コード例 #2
0
        // For every input cell, tries k generated error strings one at a time and
        // records the one that produced the largest total output error.
        // Returns a dictionary from input address to (worst error string, its total error);
        // an entry stays ("", 0.0) when no candidate produced an error greater than zero.
        // Each candidate is injected into the workbook and the original value is
        // written back before the next candidate is tried.
        public Dictionary <AST.Address, Tuple <string, double> > TopOfKErrors(AST.Address[] terminal_formula_nodes, CellDict inputs, int k, CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, string classification_file, DAG dag)
        {
            var generator        = new ErrorGenerator();
            var classification   = Classification.Deserialize(classification_file);
            var worst_by_address = new Dictionary <AST.Address, Tuple <string, double> >();

            foreach (KeyValuePair <AST.Address, string> input in inputs)
            {
                AST.Address address     = input.Key;
                string      clean_value = input.Value;

                // running maximum for this input cell
                double worst_error  = 0.0;
                string worst_string = "";

                // ask the generator for k candidate error strings
                string[] candidates = generator.GenerateErrorStrings(clean_value, classification, k);

                for (int n = 0; n < k; n++)
                {
                    string candidate = candidates[n];

                    // write the candidate typo into the workbook
                    CellDict perturbed = new CellDict();
                    perturbed.Add(address, candidate);
                    Utility.InjectValues(app, wb, perturbed);

                    // capture the formula outputs while the typo is in place
                    CellDict observed_outputs = Utility.SaveOutputs(terminal_formula_nodes, dag);

                    // put the original value back
                    CellDict restored = new CellDict();
                    restored.Add(address, clean_value);
                    Utility.InjectValues(app, wb, restored);

                    double candidate_error = Utility.CalculateTotalError(correct_outputs, observed_outputs);

                    // strictly greater: ties keep the earlier candidate
                    if (candidate_error > worst_error)
                    {
                        worst_error  = candidate_error;
                        worst_string = candidate;
                    }
                }

                worst_by_address.Add(address, new Tuple <string, double>(worst_string, worst_error));
            }
            return worst_by_address;
        }
コード例 #3
0
        // Builds a Classification by processing each (original, entered) pair in
        // data.StringPairs, writes a running percentage to the console, then
        // serializes the result to serfile and returns it.
        public static Classification Classify(MTurkParser.Data data, string serfile)
        {
            var input_total = data.NumInputs;
            var result      = new Classification();
            var pairs       = data.StringPairs.ToArray();
            int pair_count  = pairs.Length;

            for (int idx = 0; idx < pair_count; idx++)
            {
                var current  = pairs[idx];
                var original = current.Item1;
                var entered  = current.Item2;

                // percentage is measured against NumInputs, not the pair count
                double done = System.Convert.ToDouble(idx) / System.Convert.ToDouble(input_total);
                Console.Write("\r{0:P} strings classified", done);

                result.ProcessTypos(original, entered);
            }

            Console.Write("\n");
            result.Serialize(serfile);
            return result;
        }
コード例 #4
0
 // Returns the transposition distribution held in _transpositions_distribution_dict,
 // generating it from the classification first if that dictionary is still empty.
 private Dictionary <int, double> GetDistributionOfTranspositions(Classification classification)
 {
     // an empty cache means nothing has been generated yet
     if (_transpositions_distribution_dict.Count == 0)
     {
         var fresh = GenerateTranspositionsDistribution(classification);
         _transpositions_distribution_dict = fresh;
         //with no transposition data at all, fall back to delta = 0 with probability 1.0
         if (fresh.Count == 0)
         {
             fresh.Add(0, 1.0);
         }
     }
     return _transpositions_distribution_dict;
 }
コード例 #5
0
        // this method only works for functions with numerical inputs
        // Keeps generating error strings for input until one parses as a number whose
        // magnitude change (per Utility.NumericalMagnitudeChange) is not >= 0, and
        // returns that string. Non-numeric candidates leave the previous magnitude
        // in place, so they never end the loop on their own.
        public string GenerateSubtleErrorString(double input, Classification c)
        {
            double magnitude = 100;

            while (true)
            {
                // draw a fresh candidate error
                string candidate = GenerateErrorString(Convert.ToString(input), c);

                double parsed;
                bool   is_numeric = Double.TryParse(candidate, out parsed);
                if (is_numeric)
                {
                    // only numerical errors update the magnitude
                    magnitude = Utility.NumericalMagnitudeChange(parsed, input);
                }

                // written as !(x >= 0) so a NaN magnitude also ends the loop
                if (!(magnitude >= 0))
                {
                    return candidate;
                }
            }
        }
コード例 #6
0
        //Generates the distribution of strings for a particular character given a classification
        // Selects the typo-dictionary entries whose source character matches c (a None
        // key matches only a None c) and converts their counts into relative
        // frequencies keyed by the typed string. Returns an empty dictionary when
        // nothing matches.
        private Dictionary <string, double> GenerateDistributionForChar(OptChar c, Classification classification)
        {
            var matches = classification.GetTypoDict()
                          .Where(entry => OptChar.get_IsNone(entry.Key.Item1)
                                     ? OptChar.get_IsNone(c)
                                     : entry.Key.Item1.Equals(c))
                          .ToArray();

            // total number of observations for this character
            var total = matches.Sum(entry => entry.Value);

            return matches.ToDictionary(entry => entry.Key.Item2, entry => (double)entry.Value / total);
        }
コード例 #7
0
        //Gets the distribution of strings for a particular character
        //DOES NOT use previously generated distributions; generates the distribution every time
        // Returns the distribution produced by GenerateDistributionForChar; when that
        // is empty, returns a single-entry distribution mapping the character itself
        // to probability 1.0.
        private Dictionary <string, double> GetDistributionOfStringsForChar(OptChar c, Classification classification)
        {
            //Generate the probability distribution based on the classification, which contains counts of observations
            Dictionary <string, double> distribution = GenerateDistributionForChar(c, classification);

            //If our dictionary does not have any information about this character, we return the character with probability 1.0
            if (distribution.Count == 0)
            {
                // A None OptChar is a null reference, so reading c.Value on it would throw
                // NullReferenceException. The "empty character" is mapped to the empty
                // string instead — presumably matching how None is rendered elsewhere; confirm.
                string identity = OptChar.get_IsNone(c) ? "" : "" + c.Value;
                distribution.Add(identity, 1.0);
            }
            return distribution;
        }
コード例 #8
0
ファイル: Simulation.cs プロジェクト: plasma-umass/DataDebug
        // For running a simulation from the batch runner
        // Stores the supplied errors in _errors, records the largest numeric and string
        // magnitude changes among the errors and among the correct outputs, and then
        // delegates to Run with a null progress bar.
        // Throws NoRangeInputs when terminal_input_vectors is empty and NoFormulas
        // when original_inputs is empty.
        // returns the number of cells inspected
        public int RunFromBatch(int nboots,        // number of bootstraps
            string xlfile,              // name of the workbook
            double significance,        // significance threshold for test
            Excel.Application app,      // reference to Excel app
            CutoffKind ck,
            Classification c,           // data from which to generate errors
            Random r,                   // a random number generator
            AnalysisType analysisType,  // the type of analysis to run
            bool weighted,              // should we weigh things?
            bool all_outputs,           // if !all_outputs, we only consider terminal outputs
            DAG dag,          // the computation tree of the spreadsheet
            Excel.Workbook wb,          // the workbook being analyzed
            CellDict errors,            // the errors that will be introduced in the spreadsheet
            AST.Range[] terminal_input_vectors,   // the inputs
            AST.Address[] terminal_formula_cells, // the outputs
            CellDict original_inputs,          // original values of the inputs
            CellDict correct_outputs,          // the correct outputs
            long max_duration_in_ms,
            String logfile              //filename for the output log
            )
        {
            if (terminal_input_vectors.Length == 0)
            {
                throw new NoRangeInputs();
            }

            if (!original_inputs.Any())
            {
                throw new NoFormulas();
            }

            _errors = errors;

            // find the error with the largest magnitude
            // this is mostly useful for the single-perturbation experiments
            // The filtered sets are materialized once so the predicate (and its
            // dictionary lookups) is not re-run for both the emptiness check and Max().
            var num_errs = _errors.Where(pair => Utility.BothNumbers(pair.Value, original_inputs[pair.Key])).ToArray();
            var str_errs = _errors.Where(pair => !Utility.BothNumbers(pair.Value, original_inputs[pair.Key])).ToArray();

            _num_max_err_diff_mag = num_errs.Length != 0 ? num_errs.Select(
                (KeyValuePair<AST.Address, string> pair) =>
                    Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(original_inputs[pair.Key]))
                    ).Max() : 0;
            _str_max_err_diff_mag = str_errs.Length != 0 ? str_errs.Select(
                (KeyValuePair<AST.Address, string> pair) =>
                    Utility.StringMagnitudeChange(pair.Value, original_inputs[pair.Key])
                    ).Max() : 0;

            // find the output with the largest magnitude
            var num_outs = correct_outputs.Where(pair => Utility.IsNumber(pair.Value)).ToArray();
            var str_outs = correct_outputs.Where(pair => !Utility.IsNumber(pair.Value)).ToArray();

            // NOTE(review): both magnitude calls below compare each correct output with
            // correct_outputs[pair.Key], i.e. with itself — confirm this is intended.
            _num_max_output_diff_mag = num_outs.Length != 0 ? num_outs.Select(
                (KeyValuePair<AST.Address, string> pair) =>
                    Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(correct_outputs[pair.Key]))
                    ).Max() : 0;
            _str_max_output_diff_mag = str_outs.Length != 0 ? str_outs.Select(
                (KeyValuePair<AST.Address, string> pair) =>
                    Utility.StringMagnitudeChange(pair.Value, correct_outputs[pair.Key])
                    ).Max() : 0;

            return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
        }
コード例 #9
0
ファイル: Simulation.cs プロジェクト: plasma-umass/DataDebug
        // Runs one simulation: injects the errors held in _errors into the workbook,
        // lets SimulateUser try to remove them, records timing, error, effort and
        // average-precision statistics in instance fields, and finally writes
        // original_inputs back to the workbook.
        // _errors is read but not assigned here, so it must be set before calling.
        // returns the number of cells inspected
        public int Run(int nboots,                 // number of bootstraps
            string xlfile,              // name of the workbook
            double significance,        // significance threshold for test
            CutoffKind ck,              // kind of threshold function to use
            Excel.Application app,      // reference to Excel app
            Classification c,           // data from which to generate errors
            Random r,                   // a random number generator
            AnalysisType analysisType,  // the type of analysis to run
            bool weighted,              // should we weigh things?
            bool all_outputs,           // if !all_outputs, we only consider terminal outputs
            DAG dag,
            Excel.Workbook wb,
            AST.Address[] terminal_formula_cells,
            AST.Range[] terminal_input_vectors,
            CellDict original_inputs,
            CellDict correct_outputs,
            long max_duration_in_ms,
            String logfile,              //filename for the output log
            ProgBar pb
            )
        {
            // remember the run configuration
            _wb_name = xlfile;
            _wb_path = wb.Path;
            _analysis_type = analysisType;
            _significance = significance;
            _all_outputs = all_outputs;
            _weighted = weighted;

            // put the errors from _errors into the workbook
            Utility.InjectValues(app, wb, _errors);

            // outputs as they look with every error in place
            CellDict outputs_with_errors = Utility.SaveOutputs(terminal_formula_cells, dag);

            // the stopwatch covers only the error-removal phase
            Stopwatch timer = Stopwatch.StartNew();

            // remove errors until none remain; MODIFIES WORKBOOK
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, timer, logfile, pb);

            timer.Stop();
            _analysis_time = timer.Elapsed.TotalSeconds;

            // outputs after the simulated user has made corrections
            var outputs_after_fixes = Utility.SaveOutputs(terminal_formula_cells, dag);

            // total relative error remaining after correction
            _error = Utility.CalculateNormalizedError(correct_outputs, outputs_after_fixes, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // total relative error at the start (normalized by max_errors)
            ErrorDict initial_error = Utility.CalculateNormalizedError(correct_outputs, outputs_with_errors, _user.max_errors);
            _initial_total_relative_error = Utility.TotalRelativeError(initial_error);

            // effort: cells flagged (true + false positives) out of all cells in the DAG
            _max_effort = dag.allCells().Length;
            _effort = (_user.true_positives.Count + _user.false_positives.Count);
            _expended_effort = (double)_effort / (double)_max_effort;

            // compute average precision
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k,
            // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // restore original values
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

            // flag that we're done; safe to print output results
            _simulation_run = true;

            return _effort;
        }
コード例 #10
0
 //Generates the distribution of strings for a particular character given a classification
 // Picks the typo-dictionary entries whose source character matches c (a None key
 // matches only a None c) and turns their counts into relative frequencies keyed
 // by the typed string. The result is empty when no entry matches.
 private Dictionary<string, double> GenerateDistributionForChar(OptChar c, Classification classification)
 {
     var typo_counts = classification.GetTypoDict();
     bool target_is_none = OptChar.get_IsNone(c);

     var selected = typo_counts
         .Where(entry => OptChar.get_IsNone(entry.Key.Item1) ? target_is_none : entry.Key.Item1.Equals(c))
         .ToArray();

     // total observations for this character; the divisor for every frequency
     var total = selected.Sum(entry => entry.Value);

     var frequencies = new Dictionary<string, double>();
     foreach (var entry in selected)
     {
         frequencies.Add(entry.Key.Item2, (double) entry.Value / total);
     }
     return frequencies;
 }
コード例 #11
0
        //Gets the distribution of strings for a particular character
        //If the distribution has been generated before, it is reused from the _char_distributions_dict
        // On a cache miss the distribution is generated, given a single
        // (character -> 1.0) entry if it came back empty, and stored in the cache.
        private Dictionary <string, double> GetDistributionOfStringsForCharReuse(OptChar c, Classification classification)
        {
            Dictionary <string, double> result;

            if (!_char_distributions_dict.TryGetValue(c, out result))
            {
                // nothing cached for this character yet: build it now
                result = GenerateDistributionForChar(c, classification);

                //If our dictionary does not have any information about this character, we return the character with probability 1.0
                if (result.Count == 0)
                {
                    result.Add("" + c.Value, 1.0);
                }

                _char_distributions_dict.Add(c, result);
            }

            return result;
        }
コード例 #12
0
        // For running a simulation from the batch runner
        // Stores the supplied errors in _errors, records the largest numeric and string
        // magnitude changes among the errors and among the correct outputs, then hands
        // off to Run with a null progress bar.
        // Throws NoRangeInputs when terminal_input_vectors is empty and NoFormulas
        // when original_inputs is empty.
        // returns the number of cells inspected
        public int RunFromBatch(int nboots,                           // number of bootstraps
                                string xlfile,                        // name of the workbook
                                double significance,                  // significance threshold for test
                                Excel.Application app,                // reference to Excel app
                                CutoffKind ck,
                                Classification c,                     // data from which to generate errors
                                Random r,                             // a random number generator
                                AnalysisType analysisType,            // the type of analysis to run
                                bool weighted,                        // should we weigh things?
                                bool all_outputs,                     // if !all_outputs, we only consider terminal outputs
                                DAG dag,                              // the computation tree of the spreadsheet
                                Excel.Workbook wb,                    // the workbook being analyzed
                                CellDict errors,                      // the errors that will be introduced in the spreadsheet
                                AST.Range[] terminal_input_vectors,   // the inputs
                                AST.Address[] terminal_formula_cells, // the outputs
                                CellDict original_inputs,             // original values of the inputs
                                CellDict correct_outputs,             // the correct outputs
                                long max_duration_in_ms,
                                String logfile                        //filename for the output log
                                )
        {
            if (terminal_input_vectors.Length == 0)
            {
                throw new NoRangeInputs();
            }

            if (original_inputs.Count() == 0)
            {
                throw new NoFormulas();
            }

            _errors = errors;

            // split the errors by whether both the error and the original value are numeric
            // this is mostly useful for the single-perturbation experiments
            var numeric_errors = _errors.Where(e => Utility.BothNumbers(e.Value, original_inputs[e.Key]));
            var string_errors  = _errors.Where(e => !Utility.BothNumbers(e.Value, original_inputs[e.Key]));

            // largest numeric error magnitude, or zero when there are no numeric errors
            if (numeric_errors.Count() != 0)
            {
                _num_max_err_diff_mag = numeric_errors
                                        .Select(e => Utility.NumericalMagnitudeChange(Double.Parse(e.Value), Double.Parse(original_inputs[e.Key])))
                                        .Max();
            }
            else
            {
                _num_max_err_diff_mag = 0.0;
            }

            // largest string error magnitude, or zero when there are no string errors
            if (string_errors.Count() != 0)
            {
                _str_max_err_diff_mag = string_errors
                                        .Select(e => Utility.StringMagnitudeChange(e.Value, original_inputs[e.Key]))
                                        .Max();
            }
            else
            {
                _str_max_err_diff_mag = 0;
            }

            // same split for the correct outputs
            var numeric_outputs = correct_outputs.Where(o => Utility.IsNumber(o.Value));
            var string_outputs  = correct_outputs.Where(o => !Utility.IsNumber(o.Value));

            if (numeric_outputs.Count() != 0)
            {
                _num_max_output_diff_mag = numeric_outputs
                                           .Select(o => Utility.NumericalMagnitudeChange(Double.Parse(o.Value), Double.Parse(correct_outputs[o.Key])))
                                           .Max();
            }
            else
            {
                _num_max_output_diff_mag = 0.0;
            }

            if (string_outputs.Count() != 0)
            {
                _str_max_output_diff_mag = string_outputs
                                           .Select(o => Utility.StringMagnitudeChange(o.Value, correct_outputs[o.Key]))
                                           .Max();
            }
            else
            {
                _str_max_output_diff_mag = 0;
            }

            return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
        }
コード例 #13
0
 // this method only works for functions with numerical inputs
 // Repeatedly generates error strings for input and returns the first one that
 // parses as a number and whose magnitude change from input
 // (Utility.NumericalMagnitudeChange) is not >= 0.
 public string GenerateSubtleErrorString(double input, Classification c)
 {
     // the textual form of the input does not change between attempts
     string input_text = Convert.ToString(input);
     string candidate;
     double magnitude = 100;
     do
     {
         candidate = GenerateErrorString(input_text, c);
         double parsed;
         // non-numeric candidates keep the previous magnitude, so the loop continues
         if (Double.TryParse(candidate, out parsed))
         {
             magnitude = Utility.NumericalMagnitudeChange(parsed, input);
         }
     } while (magnitude >= 0);
     return candidate;
 }
コード例 #14
0
 // Converts the classification's transposition counts into relative frequencies:
 // each key keeps its identity and its value becomes count / (sum of all counts).
 private Dictionary<int, double> GenerateTranspositionsDistribution(Classification classification)
 {
     var counts = classification.GetTranspositionDict();
     // computed once up front; it is the divisor for every entry
     var total = counts.Sum(entry => entry.Value);
     return counts.ToDictionary(entry => entry.Key, entry => (double)entry.Value / total);
 }
コード例 #15
0
 //Gets the distribution of strings for a particular character
 //DOES NOT use previously generated distributions; generates the distribution every time
 // Returns the distribution produced by GenerateDistributionForChar; when that is
 // empty, returns a single-entry distribution mapping the character itself to
 // probability 1.0.
 private Dictionary<string, double> GetDistributionOfStringsForChar(OptChar c, Classification classification)
 {
     //Generate the probability distribution based on the classification, which contains counts of observations
     Dictionary<string, double> distribution = GenerateDistributionForChar(c, classification);
     //If our dictionary does not have any information about this character, we return the character with probability 1.0
     if (distribution.Count == 0)
     {
         // A None OptChar is a null reference, so reading c.Value on it would throw
         // NullReferenceException. The "empty character" is mapped to the empty
         // string instead — presumably matching how None is rendered elsewhere; confirm.
         string identity = OptChar.get_IsNone(c) ? "" : "" + c.Value;
         distribution.Add(identity, 1.0);
     }
     return distribution;
 }
コード例 #16
0
 // Returns the transposition distribution stored in _transpositions_distribution_dict.
 // If that dictionary is empty it is first regenerated from the classification,
 // with a (0 -> 1.0) entry added when the classification yields nothing.
 private Dictionary<int, double> GetDistributionOfTranspositions(Classification classification)
 {
     bool already_generated = _transpositions_distribution_dict.Count != 0;
     if (already_generated)
     {
         return _transpositions_distribution_dict;
     }

     _transpositions_distribution_dict = GenerateTranspositionsDistribution(classification);

     //no transposition information at all: use delta = 0 with probability 1.0
     bool nothing_observed = _transpositions_distribution_dict.Count == 0;
     if (nothing_observed)
     {
         _transpositions_distribution_dict.Add(0, 1.0);
     }
     return _transpositions_distribution_dict;
 }
コード例 #17
0
        // Produces a corrupted version of `input` using the typo and transposition
        // counts held in classification `c`.
        // Each attempt picks one error kind (typo or transposition) at random, picks a
        // position for it, and then runs Transposize followed by Typoize.
        // Attempts repeat until the output differs from the input. If the chosen error
        // kind has zero probability at every position the loop is abandoned and the
        // input is returned unchanged.
        // NOTE(review): relies on `r`, which is not declared in this method — presumably
        // a random number generator field on the enclosing class; confirm.
        public string GenerateErrorString(string input, Classification c)
        {
            // get typo dict
            var td = c.GetTypoDict();

            // get transposition dict
            var trd = c.GetTranspositionDict();

            // convert the input into a char array
            var ochars = StringToOptCharArray(input);

            // add leading and trailing 'empty characters'
            var inputchars = AddLeadingTrailingSpace(ochars);

            // calculate the marginal probabilities of NOT making a typo for each char in input
            // (computed over inputchars, so indices here refer to the padded array)
            double[] PrsCharNotTypo = inputchars.Select(oc =>
            {
                // count of observations where this character maps to its own string form
                var key = new Tuple<OptChar, string>(oc, OptCharToString(oc));
                int count;
                if (!td.TryGetValue(key, out count)) {
                    count = 0;
                }
                // funny case to handle the fact that FSharpOption.None == null
                var cond_dist = td.Where(kvp => kvp.Key.Item1 == null ? oc == null : kvp.Key.Item1.Equals(oc));
                // total observations recorded for this character, whatever was typed
                int total = cond_dist.Aggregate(0, (acc, kvp) => acc + kvp.Value);
                if (total == 0)
                {
                    // no data for this character: treat it as never mistyped
                    return 1.0;
                }
                else
                {
                    return (double)count / total;
                }
            }).ToArray();

            // calculate the probability of making at least one error
            // might need log-probs here
            double PrTypo = 1.0 - PrsCharNotTypo.Aggregate(1.0, (acc, pr_not_typo) => acc * pr_not_typo);

            // calculate the marginal probabilities of NOT making a
            // transposition for each position in the input
            // note that we do NOT consider the empty strings here
            // For strings of length 1, the probability of not making a
            // transposition should be exactly 1.
            double[] PrsPosNotTrans = ochars.Length > 1 ? ochars.ToArray().Select((oc, idx) =>
            {
                // count for key 0 in the transposition dict; the same for every position
                int count;
                if (!trd.TryGetValue(0, out count)) {
                    count = 0;
                }
                // only keys k with -idx <= k < input.Length - idx are summed
                // NOTE(review): the bound uses input.Length while iterating ochars —
                // presumably the two lengths are equal; confirm.
                int total = trd.Where(kvp => kvp.Key < input.Length - idx && kvp.Key >= -idx).Select(kvp => kvp.Value).Sum();
                if (total == 0)
                {
                    return 1.0;
                }
                else
                {
                    return (double)count / total;
                }
            }).ToArray() : new [] { 1.0 };

            // calculate the probability of having at least one transposition
            double PrTrans = 1.0 - PrsPosNotTrans.Aggregate(1.0, (acc, pr_not_trans) => acc * pr_not_trans);

            // calculate the relative probability of making a typo vs a transposition
            // if PrTypo and PrTrans are both 0 this is NaN; `r.NextDouble() < NaN` is
            // false, so the transposition branch below is the one taken
            double RelPrTypo = PrTypo / (PrTypo + PrTrans);

            // init with original input in case typos/transpositions prove to be impossible
            string output = input;

            // the while loop ensures that we do not return an unmodified string.
            // for most strings, returning an unmodified string is very unlikely
            do
            {
                // flip a coin to determine whether our guaranteed error is a typo or a transposition
                if (r.NextDouble() < RelPrTypo)
                {   // is a typo
                    // determine the index of the guaranteed typo
                    double[] PrsMistype = PrsCharNotTypo.Select(pr => 1.0 - pr).ToArray();
                    // if there are no possible typos then we just can't produce one
                    if (PrsMistype.Sum() == 0)
                    {
                        break;
                    }
                    var i = MultinomialSample(PrsMistype);
                    // run transposition algorithm & add leading/trailing empty chars
                    // we set the guaranteed transposition index to -1 to ensure that no
                    // transpositions are guaranteed
                    OptChar[] input_t = AddLeadingTrailingSpace(Transposize(ochars, trd, -1));
                    // run typo algorithm (adjust i for leading space)
                    // i is passed as-is: it was sampled from probabilities computed over the
                    // padded inputchars array, and input_t is padded the same way
                    output = Typoize(input_t, td, i);
                }
                else
                {   // is a transposition
                    // determine the index of the guaranteed transposition
                    double[] PrsMistype = PrsPosNotTrans.Select(pr => 1.0 - pr).ToArray();
                    // if there are no possible transpositions then we just can't produce one
                    if (PrsMistype.Sum() == 0)
                    {
                        break;
                    }
                    var i = MultinomialSample(PrsMistype);
                    // run transposition algorithm & add leading/trailing empty chars
                    OptChar[] input_t = AddLeadingTrailingSpace(Transposize(ochars, trd, i));
                    // run typo algorithm; set guaranteed typo index to -1 to ensure that no
                    // typo is guaranteed
                    output = Typoize(input_t, td, -1);
                }
            } while (input == output);

            return output;
        }
コード例 #18
0
 //Gets the distribution of strings for a particular character
 //If the distribution has been generated before, it is reused from the _char_distributions_dict
 // Otherwise the distribution is generated, padded with a (character -> 1.0)
 // entry when empty, stored in the cache and returned.
 private Dictionary<string, double> GetDistributionOfStringsForCharReuse(OptChar c, Classification classification)
 {
     Dictionary<string, double> cached;
     bool hit = _char_distributions_dict.TryGetValue(c, out cached);
     if (hit)
     {
         return cached;
     }

     // cache miss: build the distribution from the classification
     var built = GenerateDistributionForChar(c, classification);
     //If our dictionary does not have any information about this character, we return the character with probability 1.0
     if (built.Count == 0)
     {
         built.Add("" + c.Value, 1.0);
     }
     _char_distributions_dict.Add(c, built);
     return built;
 }
コード例 #19
0
        // Runs one simulation: injects the errors held in _errors into the workbook,
        // lets SimulateUser try to remove them, stores timing, error, effort and
        // average-precision statistics in instance fields, and finally writes
        // original_inputs back to the workbook.
        // _errors is read but not assigned here, so it must be set before calling.
        // returns the number of cells inspected
        public int Run(int nboots,                 // number of bootstraps
                       string xlfile,              // name of the workbook
                       double significance,        // significance threshold for test
                       CutoffKind ck,              // kind of threshold function to use
                       Excel.Application app,      // reference to Excel app
                       Classification c,           // data from which to generate errors
                       Random r,                   // a random number generator
                       AnalysisType analysisType,  // the type of analysis to run
                       bool weighted,              // should we weigh things?
                       bool all_outputs,           // if !all_outputs, we only consider terminal outputs
                       DAG dag,
                       Excel.Workbook wb,
                       AST.Address[] terminal_formula_cells,
                       AST.Range[] terminal_input_vectors,
                       CellDict original_inputs,
                       CellDict correct_outputs,
                       long max_duration_in_ms,
                       String logfile,               //filename for the output log
                       ProgBar pb
                       )
        {
            // record the configuration of this run
            _wb_name       = xlfile;
            _wb_path       = wb.Path;
            _analysis_type = analysisType;
            _significance  = significance;
            _all_outputs   = all_outputs;
            _weighted      = weighted;

            // write the errors from _errors into the workbook
            Utility.InjectValues(app, wb, _errors);

            // snapshot of the outputs with all errors present
            CellDict erroneous_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // time only the error-removal phase
            Stopwatch removal_timer = new Stopwatch();
            removal_timer.Start();

            // remove errors until none remain; MODIFIES WORKBOOK
            _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, removal_timer, logfile, pb);

            removal_timer.Stop();
            _analysis_time = removal_timer.Elapsed.TotalSeconds;

            // snapshot of the outputs after the simulated corrections
            var corrected_outputs = Utility.SaveOutputs(terminal_formula_cells, dag);

            // remaining total relative error
            _error = Utility.CalculateNormalizedError(correct_outputs, corrected_outputs, _user.max_errors);
            _total_relative_error = Utility.TotalRelativeError(_error);

            // starting total relative error (normalized by max_errors)
            ErrorDict error_at_start = Utility.CalculateNormalizedError(correct_outputs, erroneous_outputs, _user.max_errors);
            _initial_total_relative_error = Utility.TotalRelativeError(error_at_start);

            // effort: flagged cells (true + false positives) relative to all cells in the DAG
            _max_effort = dag.allCells().Length;
            int flagged_true  = _user.true_positives.Count;
            int flagged_false = _user.false_positives.Count;
            _effort          = (flagged_true + flagged_false);
            _expended_effort = (double)_effort / (double)_max_effort;

            // compute average precision
            // AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
            // where P(k) is the precision at threshold k,
            // rel(k) = \{ 1 if item at k is a true positive, 0 otherwise
            _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

            // restore original values
            Utility.InjectValues(app, wb, original_inputs);

            _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

            // flag that we're done; safe to print output results
            _simulation_run = true;

            return _effort;
        }
コード例 #20
0
        /// <summary>
        /// Produces an erroneous variant of <paramref name="input"/> by sampling a typo or a
        /// transposition according to the counts held by <paramref name="c"/>.
        /// </summary>
        /// <param name="input">The original string.</param>
        /// <param name="c">Supplies the typo dictionary and the transposition dictionary.</param>
        /// <returns>
        /// A string different from <paramref name="input"/>; or <paramref name="input"/> itself
        /// when the error kind selected by the coin flip has no probability mass.
        /// </returns>
        public string GenerateErrorString(string input, Classification c)
        {
            // typo counts and transposition counts
            var typoCounts  = c.GetTypoDict();
            var transCounts = c.GetTranspositionDict();

            // the input as a char array, plus a copy padded with leading and
            // trailing 'empty characters'
            var rawChars = StringToOptCharArray(input);
            OptChar[] paddedChars = AddLeadingTrailingSpace(rawChars);

            // marginal probability of NOT making a typo, one entry per padded character
            double[] prNotTypo = new double[paddedChars.Length];
            for (int pos = 0; pos < paddedChars.Length; pos++)
            {
                OptChar oc = paddedChars[pos];

                // count stored for (character, its own string form); a missing entry counts as 0
                var identityKey = new Tuple <OptChar, string>(oc, OptCharToString(oc));
                int typedAsItself;
                if (!typoCounts.TryGetValue(identityKey, out typedAsItself))
                {
                    typedAsItself = 0;
                }

                // sum of all counts whose first key component matches this character;
                // FSharpOption.None == null, so null has to be tested explicitly
                int timesIntended = 0;
                foreach (var entry in typoCounts)
                {
                    var intended = entry.Key.Item1;
                    bool sameChar = intended == null ? oc == null : intended.Equals(oc);
                    if (sameChar)
                    {
                        timesIntended += entry.Value;
                    }
                }

                // no observations at all: treat the character as never mistyped
                prNotTypo[pos] = timesIntended == 0 ? 1.0 : (double)typedAsItself / timesIntended;
            }

            // probability of at least one typo (might need log-probs here)
            double prTypo = 1.0 - prNotTypo.Aggregate(1.0, (acc, pr_not_typo) => acc * pr_not_typo);

            // marginal probability of NOT making a transposition, one entry per
            // unpadded position; a string of length <= 1 gets the single value 1.0
            double[] prNotTrans;
            if (rawChars.Length > 1)
            {
                prNotTrans = new double[rawChars.Length];
                for (int pos = 0; pos < rawChars.Length; pos++)
                {
                    // count stored under key 0; a missing entry counts as 0
                    int unmoved;
                    if (!transCounts.TryGetValue(0, out unmoved))
                    {
                        unmoved = 0;
                    }

                    // only keys k with -pos <= k < input.Length - pos are counted
                    int lowest = -pos;
                    int limit  = input.Length - pos;
                    int inRange = transCounts.Where(kvp => kvp.Key < limit && kvp.Key >= lowest).Sum(kvp => kvp.Value);

                    prNotTrans[pos] = inRange == 0 ? 1.0 : (double)unmoved / inRange;
                }
            }
            else
            {
                prNotTrans = new [] { 1.0 };
            }

            // probability of at least one transposition
            double prTrans = 1.0 - prNotTrans.Aggregate(1.0, (acc, pr_not_trans) => acc * pr_not_trans);

            // relative probability of a typo versus a transposition
            double relPrTypo = prTypo / (prTypo + prTrans);

            // start from the original input in case no error can be produced
            string result = input;

            // keep sampling until the result differs from the input
            while (true)
            {
                // coin flip: is the guaranteed error a typo or a transposition?
                bool wantTypo = r.NextDouble() < relPrTypo;

                // per-position probability of the chosen kind of mistake
                double[] prNotMistake = wantTypo ? prNotTypo : prNotTrans;
                double[] prMistake = prNotMistake.Select(pr => 1.0 - pr).ToArray();

                // the chosen kind of mistake is impossible: give up, returning the input
                if (prMistake.Sum() == 0)
                {
                    break;
                }

                // position of the guaranteed mistake
                var forced = MultinomialSample(prMistake);

                // the kind that was not chosen receives -1, which (per the original
                // comments) means no mistake of that kind is guaranteed
                var forcedTrans = wantTypo ? -1 : forced;
                var forcedTypo  = wantTypo ? forced : -1;

                // transpositions first, then pad, then typos
                OptChar[] transposed = AddLeadingTrailingSpace(Transposize(rawChars, transCounts, forcedTrans));
                result = Typoize(transposed, typoCounts, forcedTypo);

                if (input != result)
                {
                    break;
                }
            }

            return(result);
        }
コード例 #21
0
ファイル: Utility.cs プロジェクト: plasma-umass/DataDebug
        /// <summary>
        /// For every input cell, injects each of <paramref name="k"/> generated error strings
        /// into the workbook, measures the total error of the outputs against
        /// <paramref name="correct_outputs"/>, and remembers the string with the largest error.
        /// Returns the top 5% of inputs (rounded up), ranked by that largest error, mapped to
        /// their worst error string.
        /// </summary>
        /// <param name="output_nodes">Addresses whose outputs are saved after each injection.</param>
        /// <param name="inputs">Input cells and their original values.</param>
        /// <param name="k">Number of alternatives to consider per input.</param>
        /// <param name="correct_outputs">Outputs to compare against.</param>
        /// <param name="app">Excel application passed through to InjectValues.</param>
        /// <param name="wb">Workbook passed through to InjectValues; modified and restored per trial.</param>
        /// <param name="c">Classification used to generate error strings.</param>
        /// <param name="dag">Dependence graph passed through to SaveOutputs.</param>
        /// <returns>Map from input address to the error string with the largest observed total error.</returns>
        public static CellDict GenImportantErrors(AST.Address[] output_nodes,
            CellDict inputs,
            int k,         // number of alternatives to consider
            CellDict correct_outputs,
            Excel.Application app,
            Excel.Workbook wb,
            Classification c,
            DAG dag)
        {
            // fraction of the inputs (ranked by largest error) that is returned
            const double TopFraction = 0.05;

            var eg = new ErrorGenerator();
            var max_error_produced_dictionary = new Dictionary<AST.Address, Tuple<string, double>>();

            foreach (KeyValuePair<AST.Address, string> pair in inputs)
            {
                AST.Address addr = pair.Key;
                string orig_value = pair.Value;

                // largest error seen so far for this input, and the string that caused it;
                // stays at ("", 0.0) when no candidate produces a positive error
                double max_error_produced = 0.0;
                string max_error_string = "";

                // get k strings
                string[] errorstrings = eg.GenerateErrorStrings(orig_value, c, k);

                foreach (string errorstring in errorstrings)
                {
                    CellDict cd = new CellDict();
                    CellDict incorrect_outputs;

                    try
                    {
                        // inject the typo
                        cd.Add(addr, errorstring);
                        InjectValues(app, wb, cd);

                        // save function outputs
                        incorrect_outputs = SaveOutputs(output_nodes, dag);
                    }
                    finally
                    {
                        // remove the typo that was introduced; done in a finally block so
                        // the workbook gets its original value back even if injecting or
                        // saving outputs throws
                        cd.Clear();
                        cd.Add(addr, orig_value);
                        InjectValues(app, wb, cd);
                    }

                    double total_error = Utility.CalculateTotalError(correct_outputs, incorrect_outputs);

                    // keep track of the largest observed error
                    if (total_error > max_error_produced)
                    {
                        max_error_produced = total_error;
                        max_error_string = errorstring;
                    }
                }

                // record this input's worst error string and the error it produced
                max_error_produced_dictionary.Add(addr, new Tuple<string, double>(max_error_string, max_error_produced));
            }

            // sort by max_error_produced, largest first
            var maxen = max_error_produced_dictionary.OrderByDescending(pair => pair.Value.Item2).Select(pair => new Tuple<AST.Address, string>(pair.Key, pair.Value.Item1)).ToList();

            return maxen.Take((int)Math.Ceiling(TopFraction * inputs.Count)).ToDictionary(tup => tup.Item1, tup => tup.Item2);
        }
コード例 #22
0
 /// <summary>
 /// Builds an array of <paramref name="k"/> error strings by calling
 /// GenerateErrorString on <paramref name="orig"/> once per slot.
 /// </summary>
 /// <param name="orig">The original string.</param>
 /// <param name="c">Classification passed to GenerateErrorString.</param>
 /// <param name="k">Number of error strings to generate.</param>
 /// <returns>An array of length <paramref name="k"/>.</returns>
 public string[] GenerateErrorStrings(string orig, Classification c, int k)
 {
     string[] variants = new string[k];
     int slot = 0;
     while (slot < variants.Length)
     {
         variants[slot] = GenerateErrorString(orig, c);
         slot++;
     }
     return variants;
 }