/// <summary>
/// Builds a <c>Classification</c> from every (original, entered) string pair in
/// <paramref name="data"/>, printing progress to the console, then serializes it.
/// </summary>
/// <param name="data">Parsed data exposing <c>NumInputs</c> and <c>StringPairs</c>.</param>
/// <param name="serfile">Value handed to <c>Classification.Serialize</c>.</param>
/// <returns>The populated classification.</returns>
public static Classification Classify(MTurkParser.Data data, string serfile)
{
    // Denominator of the progress display.
    var inputTotal = data.NumInputs;
    var classification = new Classification();

    int processed = 0;
    foreach (var pair in data.StringPairs.ToArray())
    {
        // "\r" returns to the start of the console line so the percentage updates in place.
        Console.Write("\r{0:P} strings classified", System.Convert.ToDouble(processed) / System.Convert.ToDouble(inputTotal));
        classification.ProcessTypos(pair.Item1, pair.Item2);
        processed++;
    }

    Console.Write("\n");
    classification.Serialize(serfile);
    return classification;
}
/// <summary>
/// For every input cell, tries <paramref name="k"/> generated error strings and records
/// the one that produced the largest total output error.
/// </summary>
/// <param name="terminal_formula_nodes">Cells whose outputs are saved after each injection.</param>
/// <param name="inputs">Input addresses mapped to their original values.</param>
/// <param name="k">Number of error strings to try per input.</param>
/// <param name="correct_outputs">Outputs that each perturbed result is compared against.</param>
/// <param name="app">Excel application passed to <c>Utility.InjectValues</c>.</param>
/// <param name="wb">Workbook that is temporarily modified and then restored.</param>
/// <param name="classification_file">Argument to <c>Classification.Deserialize</c>.</param>
/// <param name="dag">Passed to <c>Utility.SaveOutputs</c>.</param>
/// <returns>
/// Per input address, the worst error string and its total error; ("", 0.0) when no
/// candidate produced an error greater than zero.
/// </returns>
public Dictionary<AST.Address, Tuple<string, double>> TopOfKErrors(AST.Address[] terminal_formula_nodes, CellDict inputs, int k, CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, string classification_file, DAG dag)
{
    var generator = new ErrorGenerator();
    var classification = Classification.Deserialize(classification_file);
    var worstByAddress = new Dictionary<AST.Address, Tuple<string, double>>();

    foreach (KeyValuePair<AST.Address, string> entry in inputs)
    {
        AST.Address addr = entry.Key;
        string originalValue = entry.Value;

        double worstError = 0.0;
        string worstString = "";

        // Candidate replacement strings for this cell.
        string[] candidates = generator.GenerateErrorStrings(originalValue, classification, k);

        for (int n = 0; n < k; n++)
        {
            string candidate = candidates[n];

            // Inject the candidate into the workbook.
            CellDict patch = new CellDict();
            patch.Add(addr, candidate);
            Utility.InjectValues(app, wb, patch);

            // Capture the outputs while the error is present.
            CellDict perturbedOutputs = Utility.SaveOutputs(terminal_formula_nodes, dag);

            // Put the original value back before evaluating the damage.
            patch.Clear();
            patch.Add(addr, originalValue);
            Utility.InjectValues(app, wb, patch);

            double candidateError = Utility.CalculateTotalError(correct_outputs, perturbedOutputs);

            // Strictly-greater comparison: the first of equally bad candidates wins.
            if (candidateError > worstError)
            {
                worstError = candidateError;
                worstString = candidate;
            }
        }

        worstByAddress.Add(addr, new Tuple<string, double>(worstString, worstError));
    }

    return worstByAddress;
}
/// <summary>
/// Builds a <c>Classification</c> from every (original, entered) string pair in
/// <paramref name="data"/>, printing progress to the console, then serializes it.
/// </summary>
/// <param name="data">Parsed data exposing <c>NumInputs</c> and <c>StringPairs</c>.</param>
/// <param name="serfile">Value handed to <c>Classification.Serialize</c>.</param>
/// <returns>The populated classification.</returns>
public static Classification Classify(MTurkParser.Data data, string serfile)
{
    // Denominator of the progress display.
    var inputTotal = data.NumInputs;
    var classification = new Classification();

    int processed = 0;
    foreach (var pair in data.StringPairs.ToArray())
    {
        // "\r" returns to the start of the console line so the percentage updates in place.
        Console.Write("\r{0:P} strings classified", System.Convert.ToDouble(processed) / System.Convert.ToDouble(inputTotal));
        classification.ProcessTypos(pair.Item1, pair.Item2);
        processed++;
    }

    Console.Write("\n");
    classification.Serialize(serfile);
    return classification;
}
/// <summary>
/// Returns the transposition distribution, generating and storing it in
/// <c>_transpositions_distribution_dict</c> while that field is still empty.
/// </summary>
/// <param name="classification">Source of the counts used when generating.</param>
/// <returns>The stored distribution (delta mapped to probability).</returns>
private Dictionary<int, double> GetDistributionOfTranspositions(Classification classification)
{
    // An empty dictionary means "not generated yet".
    if (_transpositions_distribution_dict.Count == 0)
    {
        _transpositions_distribution_dict = GenerateTranspositionsDistribution(classification);

        // No transposition data at all: fall back to delta = 0 with probability 1.0.
        if (_transpositions_distribution_dict.Count == 0)
        {
            _transpositions_distribution_dict.Add(0, 1.0);
        }
    }

    return _transpositions_distribution_dict;
}
/// <summary>
/// Keeps generating error strings for <paramref name="input"/> until one parses as a
/// number and <c>Utility.NumericalMagnitudeChange</c> for it is not &gt;= 0.
/// Only meaningful for numerical inputs.
/// </summary>
/// <param name="input">The original numeric value.</param>
/// <param name="c">Classification forwarded to <c>GenerateErrorString</c>.</param>
/// <returns>The first generated string that satisfies the condition above.</returns>
/// <remarks>There is no retry limit; the loop only ends when such a string is produced.</remarks>
public string GenerateSubtleErrorString(double input, Classification c)
{
    for (;;)
    {
        string candidate = GenerateErrorString(Convert.ToString(input), c);

        double parsed;
        if (!Double.TryParse(candidate, out parsed))
        {
            // Not a numerical error: try again.
            continue;
        }

        double magnitude = Utility.NumericalMagnitudeChange(parsed, input);
        if (magnitude >= 0)
        {
            // Magnitude change is still non-negative: try again.
            continue;
        }

        return candidate;
    }
}
/// <summary>
/// Builds the distribution of typed strings observed for one character by
/// normalizing the matching counts in the classification's typo dictionary.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Provides the typo dictionary of observation counts.</param>
/// <returns>
/// Typed string mapped to its count divided by the sum of the matching counts;
/// empty when no entry matches <paramref name="c"/>.
/// </returns>
private Dictionary<string, double> GenerateDistributionForChar(OptChar c, Classification classification)
{
    var counts = classification.GetTypoDict();
    bool wantNone = OptChar.get_IsNone(c);

    // A None key only matches a None query; otherwise compare with Equals.
    var matches = counts
        .Where(pair => OptChar.get_IsNone(pair.Key.Item1) ? wantNone : pair.Key.Item1.Equals(c))
        .ToArray();

    var total = matches.Sum(pair => pair.Value);

    return matches.ToDictionary(pair => pair.Key.Item2, pair => (double)pair.Value / total);
}
/// <summary>
/// Gets the distribution of typed strings for a character. Does NOT reuse previously
/// generated distributions; the distribution is regenerated on every call.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Holds the observation counts.</param>
/// <returns>
/// The generated distribution, or a single entry mapping the character to itself with
/// probability 1.0 when the classification has no data for it.
/// </returns>
private Dictionary<string, double> GetDistributionOfStringsForChar(OptChar c, Classification classification)
{
    // Generate the probability distribution from the classification's observation counts.
    Dictionary<string, double> distribution = GenerateDistributionForChar(c, classification);

    // No information about this character: return the character itself with probability 1.0.
    if (distribution.Count == 0)
    {
        // None is represented as null, so reading c.Value on it would throw.
        // NOTE(review): None is mapped to the empty string here — confirm this matches
        // how the rest of the generator spells an empty character.
        string unchanged = OptChar.get_IsNone(c) ? "" : "" + c.Value;
        distribution.Add(unchanged, 1.0);
    }

    return distribution;
}
/// <summary>
/// Entry point for the batch runner: records the errors to inject, computes the largest
/// numeric/string magnitudes for the errors and the outputs, then delegates to <c>Run</c>.
/// </summary>
/// <param name="nboots">Number of bootstraps.</param>
/// <param name="xlfile">Name of the workbook.</param>
/// <param name="significance">Significance threshold for the test.</param>
/// <param name="app">Reference to the Excel application.</param>
/// <param name="ck">Cutoff kind, forwarded to <c>Run</c>.</param>
/// <param name="c">Data from which to generate errors.</param>
/// <param name="r">A random number generator.</param>
/// <param name="analysisType">The type of analysis to run.</param>
/// <param name="weighted">Whether to weigh things.</param>
/// <param name="all_outputs">If false, only terminal outputs are considered.</param>
/// <param name="dag">The computation tree of the spreadsheet.</param>
/// <param name="wb">The workbook being analyzed.</param>
/// <param name="errors">The errors that will be introduced in the spreadsheet.</param>
/// <param name="terminal_input_vectors">The inputs; must not be empty.</param>
/// <param name="terminal_formula_cells">The outputs.</param>
/// <param name="original_inputs">Original values of the inputs; must not be empty.</param>
/// <param name="correct_outputs">The correct outputs.</param>
/// <param name="max_duration_in_ms">Forwarded to <c>Run</c>.</param>
/// <param name="logfile">Filename for the output log.</param>
/// <returns>The value returned by <c>Run</c> (the number of cells inspected).</returns>
/// <exception cref="NoRangeInputs">When <paramref name="terminal_input_vectors"/> is empty.</exception>
/// <exception cref="NoFormulas">When <paramref name="original_inputs"/> is empty.</exception>
public int RunFromBatch(int nboots, string xlfile, double significance, Excel.Application app, CutoffKind ck, Classification c, Random r, AnalysisType analysisType, bool weighted, bool all_outputs, DAG dag, Excel.Workbook wb, CellDict errors, AST.Range[] terminal_input_vectors, AST.Address[] terminal_formula_cells, CellDict original_inputs, CellDict correct_outputs, long max_duration_in_ms, String logfile)
{
    if (terminal_input_vectors.Length == 0)
    {
        throw new NoRangeInputs();
    }
    if (original_inputs.Count() == 0)
    {
        throw new NoFormulas();
    }

    _errors = errors;

    // Largest error magnitude; mostly useful for the single-perturbation experiments.
    // Errors are split by whether both the error and the original value are numbers.
    var numericErrors = _errors.Where(pair => Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));
    var stringErrors = _errors.Where(pair => !Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));

    if (numericErrors.Count() != 0)
    {
        _num_max_err_diff_mag = numericErrors
            .Select(pair => Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(original_inputs[pair.Key])))
            .Max();
    }
    else
    {
        _num_max_err_diff_mag = 0;
    }

    if (stringErrors.Count() != 0)
    {
        _str_max_err_diff_mag = stringErrors
            .Select(pair => Utility.StringMagnitudeChange(pair.Value, original_inputs[pair.Key]))
            .Max();
    }
    else
    {
        _str_max_err_diff_mag = 0;
    }

    // Largest output magnitude. Each output is compared against its own entry
    // in correct_outputs, exactly as the lookups below show.
    var numericOutputs = correct_outputs.Where(pair => Utility.IsNumber(pair.Value));
    var stringOutputs = correct_outputs.Where(pair => !Utility.IsNumber(pair.Value));

    if (numericOutputs.Count() != 0)
    {
        _num_max_output_diff_mag = numericOutputs
            .Select(pair => Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(correct_outputs[pair.Key])))
            .Max();
    }
    else
    {
        _num_max_output_diff_mag = 0;
    }

    if (stringOutputs.Count() != 0)
    {
        _str_max_output_diff_mag = stringOutputs
            .Select(pair => Utility.StringMagnitudeChange(pair.Value, correct_outputs[pair.Key]))
            .Max();
    }
    else
    {
        _str_max_output_diff_mag = 0;
    }

    // No progress bar is supplied when running from the batch runner.
    return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
}
/// <summary>
/// Runs one simulation: injects <c>_errors</c> into the workbook, simulates the user
/// removing them, computes error/effort/precision statistics into the instance fields,
/// and finally re-injects the original inputs.
/// </summary>
/// <param name="nboots">Number of bootstraps.</param>
/// <param name="xlfile">Name of the workbook.</param>
/// <param name="significance">Significance threshold for the test.</param>
/// <param name="ck">Kind of threshold function to use.</param>
/// <param name="app">Reference to the Excel application.</param>
/// <param name="c">Data from which to generate errors (not referenced in this body).</param>
/// <param name="r">A random number generator (not referenced in this body).</param>
/// <param name="analysisType">The type of analysis to run.</param>
/// <param name="weighted">Whether to weigh things.</param>
/// <param name="all_outputs">If false, only terminal outputs are considered.</param>
/// <param name="dag">Passed to <c>SimulateUser</c> and <c>Utility.SaveOutputs</c>.</param>
/// <param name="wb">The workbook; it is modified while the simulation runs.</param>
/// <param name="terminal_formula_cells">Cells whose outputs are saved.</param>
/// <param name="terminal_input_vectors">Not referenced in this body.</param>
/// <param name="original_inputs">Values re-injected at the end.</param>
/// <param name="correct_outputs">Reference outputs for the error computations.</param>
/// <param name="max_duration_in_ms">Forwarded to <c>SimulateUser</c>.</param>
/// <param name="logfile">Filename for the output log.</param>
/// <param name="pb">Progress bar forwarded to <c>SimulateUser</c>.</param>
/// <returns>The number of cells inspected (<c>_effort</c>).</returns>
public int Run(int nboots, string xlfile, double significance, CutoffKind ck, Excel.Application app, Classification c, Random r, AnalysisType analysisType, bool weighted, bool all_outputs, DAG dag, Excel.Workbook wb, AST.Address[] terminal_formula_cells, AST.Range[] terminal_input_vectors, CellDict original_inputs, CellDict correct_outputs, long max_duration_in_ms, String logfile, ProgBar pb)
{
    // Remember the workbook identity and the run configuration.
    _wb_name = xlfile;
    _wb_path = wb.Path;
    _analysis_type = analysisType;
    _significance = significance;
    _all_outputs = all_outputs;
    _weighted = weighted;

    // Inject the errors from _errors and capture the resulting outputs.
    Utility.InjectValues(app, wb, _errors);
    CellDict outputsWithErrors = Utility.SaveOutputs(terminal_formula_cells, dag);

    // Time the removal of errors; SimulateUser MODIFIES THE WORKBOOK.
    var timer = Stopwatch.StartNew();
    _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, timer, logfile, pb);
    timer.Stop();
    _analysis_time = timer.Elapsed.TotalSeconds;

    // Outputs after the simulated corrections.
    var outputsAfterCorrection = Utility.SaveOutputs(terminal_formula_cells, dag);

    // Total relative error that remains.
    _error = Utility.CalculateNormalizedError(correct_outputs, outputsAfterCorrection, _user.max_errors);
    _total_relative_error = Utility.TotalRelativeError(_error);

    // Starting total relative error (normalized by the same max_errors).
    ErrorDict initialError = Utility.CalculateNormalizedError(correct_outputs, outputsWithErrors, _user.max_errors);
    _initial_total_relative_error = Utility.TotalRelativeError(initialError);

    // Effort: cells inspected relative to all cells in the DAG.
    _max_effort = dag.allCells().Length;
    _effort = (_user.true_positives.Count + _user.false_positives.Count);
    _expended_effort = (double)_effort / (double)_max_effort;

    // Average precision:
    //   AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
    // where P(k) is the precision at threshold k and
    // rel(k) is 1 if the item at k is a true positive, 0 otherwise.
    _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

    // Restore the original values.
    Utility.InjectValues(app, wb, original_inputs);

    _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

    // Flag that we're done; safe to print output results.
    _simulation_run = true;

    return _effort;
}
/// <summary>
/// Builds the distribution of typed strings observed for one character by
/// normalizing the matching counts in the classification's typo dictionary.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Provides the typo dictionary of observation counts.</param>
/// <returns>
/// Typed string mapped to its count divided by the sum of the matching counts;
/// empty when no entry matches <paramref name="c"/>.
/// </returns>
private Dictionary<string, double> GenerateDistributionForChar(OptChar c, Classification classification)
{
    var counts = classification.GetTypoDict();
    bool wantNone = OptChar.get_IsNone(c);

    // A None key only matches a None query; otherwise compare with Equals.
    var matches = counts
        .Where(pair => OptChar.get_IsNone(pair.Key.Item1) ? wantNone : pair.Key.Item1.Equals(c))
        .ToArray();

    var total = matches.Sum(pair => pair.Value);

    return matches.ToDictionary(pair => pair.Key.Item2, pair => (double)pair.Value / total);
}
/// <summary>
/// Gets the distribution of typed strings for a character, reusing a distribution
/// stored in <c>_char_distributions_dict</c> when one was generated before.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Holds the observation counts.</param>
/// <returns>
/// The cached or newly generated distribution, or a single entry mapping the character
/// to itself with probability 1.0 when the classification has no data for it.
/// </returns>
private Dictionary<string, double> GetDistributionOfStringsForCharReuse(OptChar c, Classification classification)
{
    // None is represented as null. It cannot be dereferenced, and
    // NOTE(review): presumably cannot be used as a key of the cache either
    // (a standard Dictionary rejects null keys) — so None bypasses the cache.
    bool isNone = OptChar.get_IsNone(c);

    Dictionary<string, double> distribution;

    // If we have already generated a distribution for this character, return it.
    if (!isNone && _char_distributions_dict.TryGetValue(c, out distribution))
    {
        return distribution;
    }

    // Otherwise generate the distribution.
    distribution = GenerateDistributionForChar(c, classification);

    // No information about this character: return the character itself with probability 1.0.
    if (distribution.Count == 0)
    {
        // NOTE(review): None is mapped to the empty string here — confirm this matches
        // how the rest of the generator spells an empty character.
        string unchanged = isNone ? "" : "" + c.Value;
        distribution.Add(unchanged, 1.0);
    }

    if (!isNone)
    {
        _char_distributions_dict.Add(c, distribution);
    }

    return distribution;
}
/// <summary>
/// Entry point for the batch runner: records the errors to inject, computes the largest
/// numeric/string magnitudes for the errors and the outputs, then delegates to <c>Run</c>.
/// </summary>
/// <param name="nboots">Number of bootstraps.</param>
/// <param name="xlfile">Name of the workbook.</param>
/// <param name="significance">Significance threshold for the test.</param>
/// <param name="app">Reference to the Excel application.</param>
/// <param name="ck">Cutoff kind, forwarded to <c>Run</c>.</param>
/// <param name="c">Data from which to generate errors.</param>
/// <param name="r">A random number generator.</param>
/// <param name="analysisType">The type of analysis to run.</param>
/// <param name="weighted">Whether to weigh things.</param>
/// <param name="all_outputs">If false, only terminal outputs are considered.</param>
/// <param name="dag">The computation tree of the spreadsheet.</param>
/// <param name="wb">The workbook being analyzed.</param>
/// <param name="errors">The errors that will be introduced in the spreadsheet.</param>
/// <param name="terminal_input_vectors">The inputs; must not be empty.</param>
/// <param name="terminal_formula_cells">The outputs.</param>
/// <param name="original_inputs">Original values of the inputs; must not be empty.</param>
/// <param name="correct_outputs">The correct outputs.</param>
/// <param name="max_duration_in_ms">Forwarded to <c>Run</c>.</param>
/// <param name="logfile">Filename for the output log.</param>
/// <returns>The value returned by <c>Run</c> (the number of cells inspected).</returns>
/// <exception cref="NoRangeInputs">When <paramref name="terminal_input_vectors"/> is empty.</exception>
/// <exception cref="NoFormulas">When <paramref name="original_inputs"/> is empty.</exception>
public int RunFromBatch(int nboots, string xlfile, double significance, Excel.Application app, CutoffKind ck, Classification c, Random r, AnalysisType analysisType, bool weighted, bool all_outputs, DAG dag, Excel.Workbook wb, CellDict errors, AST.Range[] terminal_input_vectors, AST.Address[] terminal_formula_cells, CellDict original_inputs, CellDict correct_outputs, long max_duration_in_ms, String logfile)
{
    if (terminal_input_vectors.Length == 0)
    {
        throw new NoRangeInputs();
    }
    if (original_inputs.Count() == 0)
    {
        throw new NoFormulas();
    }

    _errors = errors;

    // Largest error magnitude; mostly useful for the single-perturbation experiments.
    // Errors are split by whether both the error and the original value are numbers.
    var numericErrors = _errors.Where(pair => Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));
    var stringErrors = _errors.Where(pair => !Utility.BothNumbers(pair.Value, original_inputs[pair.Key]));

    if (numericErrors.Count() != 0)
    {
        _num_max_err_diff_mag = numericErrors
            .Select(pair => Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(original_inputs[pair.Key])))
            .Max();
    }
    else
    {
        _num_max_err_diff_mag = 0;
    }

    if (stringErrors.Count() != 0)
    {
        _str_max_err_diff_mag = stringErrors
            .Select(pair => Utility.StringMagnitudeChange(pair.Value, original_inputs[pair.Key]))
            .Max();
    }
    else
    {
        _str_max_err_diff_mag = 0;
    }

    // Largest output magnitude. Each output is compared against its own entry
    // in correct_outputs, exactly as the lookups below show.
    var numericOutputs = correct_outputs.Where(pair => Utility.IsNumber(pair.Value));
    var stringOutputs = correct_outputs.Where(pair => !Utility.IsNumber(pair.Value));

    if (numericOutputs.Count() != 0)
    {
        _num_max_output_diff_mag = numericOutputs
            .Select(pair => Utility.NumericalMagnitudeChange(Double.Parse(pair.Value), Double.Parse(correct_outputs[pair.Key])))
            .Max();
    }
    else
    {
        _num_max_output_diff_mag = 0;
    }

    if (stringOutputs.Count() != 0)
    {
        _str_max_output_diff_mag = stringOutputs
            .Select(pair => Utility.StringMagnitudeChange(pair.Value, correct_outputs[pair.Key]))
            .Max();
    }
    else
    {
        _str_max_output_diff_mag = 0;
    }

    // No progress bar is supplied when running from the batch runner.
    return Run(nboots, xlfile, significance, ck, app, c, r, analysisType, weighted, all_outputs, dag, wb, terminal_formula_cells, terminal_input_vectors, original_inputs, correct_outputs, max_duration_in_ms, logfile, null);
}
/// <summary>
/// Keeps generating error strings for <paramref name="input"/> until one parses as a
/// number and <c>Utility.NumericalMagnitudeChange</c> for it is not &gt;= 0.
/// Only meaningful for numerical inputs.
/// </summary>
/// <param name="input">The original numeric value.</param>
/// <param name="c">Classification forwarded to <c>GenerateErrorString</c>.</param>
/// <returns>The first generated string that satisfies the condition above.</returns>
/// <remarks>There is no retry limit; the loop only ends when such a string is produced.</remarks>
public string GenerateSubtleErrorString(double input, Classification c)
{
    for (;;)
    {
        string candidate = GenerateErrorString(Convert.ToString(input), c);

        double parsed;
        if (!Double.TryParse(candidate, out parsed))
        {
            // Not a numerical error: try again.
            continue;
        }

        double magnitude = Utility.NumericalMagnitudeChange(parsed, input);
        if (magnitude >= 0)
        {
            // Magnitude change is still non-negative: try again.
            continue;
        }

        return candidate;
    }
}
/// <summary>
/// Normalizes the classification's transposition counts into a distribution.
/// </summary>
/// <param name="classification">Provides the transposition dictionary of counts.</param>
/// <returns>
/// Each key of the transposition dictionary mapped to its count divided by the sum of
/// all counts; empty when the transposition dictionary is empty.
/// </returns>
private Dictionary<int, double> GenerateTranspositionsDistribution(Classification classification)
{
    var counts = classification.GetTranspositionDict();
    var total = counts.Sum(pair => pair.Value);

    var distribution = new Dictionary<int, double>();
    foreach (var pair in counts)
    {
        distribution.Add(pair.Key, (double)pair.Value / total);
    }

    return distribution;
}
/// <summary>
/// Gets the distribution of typed strings for a character. Does NOT reuse previously
/// generated distributions; the distribution is regenerated on every call.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Holds the observation counts.</param>
/// <returns>
/// The generated distribution, or a single entry mapping the character to itself with
/// probability 1.0 when the classification has no data for it.
/// </returns>
private Dictionary<string, double> GetDistributionOfStringsForChar(OptChar c, Classification classification)
{
    // Generate the probability distribution from the classification's observation counts.
    Dictionary<string, double> distribution = GenerateDistributionForChar(c, classification);

    // No information about this character: return the character itself with probability 1.0.
    if (distribution.Count == 0)
    {
        // None is represented as null, so reading c.Value on it would throw.
        // NOTE(review): None is mapped to the empty string here — confirm this matches
        // how the rest of the generator spells an empty character.
        string unchanged = OptChar.get_IsNone(c) ? "" : "" + c.Value;
        distribution.Add(unchanged, 1.0);
    }

    return distribution;
}
/// <summary>
/// Returns the transposition distribution, generating and storing it in
/// <c>_transpositions_distribution_dict</c> while that field is still empty.
/// </summary>
/// <param name="classification">Source of the counts used when generating.</param>
/// <returns>The stored distribution (delta mapped to probability).</returns>
private Dictionary<int, double> GetDistributionOfTranspositions(Classification classification)
{
    // An empty dictionary means "not generated yet".
    if (_transpositions_distribution_dict.Count == 0)
    {
        _transpositions_distribution_dict = GenerateTranspositionsDistribution(classification);

        // No transposition data at all: fall back to delta = 0 with probability 1.0.
        if (_transpositions_distribution_dict.Count == 0)
        {
            _transpositions_distribution_dict.Add(0, 1.0);
        }
    }

    return _transpositions_distribution_dict;
}
/// <summary>
/// Produces a modified version of <paramref name="input"/> containing at least one
/// typo or transposition, chosen according to the counts in the classification.
/// </summary>
/// <param name="input">The string to corrupt.</param>
/// <param name="c">Provides the typo and transposition count dictionaries.</param>
/// <returns>
/// A string different from <paramref name="input"/>, or <paramref name="input"/> itself
/// when the selected error kind has zero probability at every position.
/// </returns>
public string GenerateErrorString(string input, Classification c)
{
    var typoCounts = c.GetTypoDict();
    var transCounts = c.GetTranspositionDict();

    // The input as option-chars, plus a copy padded with leading/trailing 'empty characters'.
    var bareChars = StringToOptCharArray(input);
    var paddedChars = AddLeadingTrailingSpace(bareChars);

    // Marginal probability of NOT making a typo, for each padded character.
    double[] notTypoProbs = paddedChars.Select(oc =>
    {
        int unchanged;
        if (!typoCounts.TryGetValue(new Tuple<OptChar, string>(oc, OptCharToString(oc)), out unchanged))
        {
            unchanged = 0;
        }

        // Sum every observation for this character.
        // FSharpOption.None == null, hence the explicit null comparison.
        int observed = 0;
        foreach (var kvp in typoCounts)
        {
            bool sameChar = kvp.Key.Item1 == null ? oc == null : kvp.Key.Item1.Equals(oc);
            if (sameChar)
            {
                observed += kvp.Value;
            }
        }

        return observed == 0 ? 1.0 : (double)unchanged / observed;
    }).ToArray();

    // Probability of making at least one typo (might need log-probs here).
    double noTypoAnywhere = 1.0;
    foreach (double p in notTypoProbs)
    {
        noTypoAnywhere *= p;
    }
    double prTypo = 1.0 - noTypoAnywhere;

    // Marginal probability of NOT making a transposition, for each position of the
    // unpadded input. For inputs of length <= 1 it is exactly 1.
    double[] notTransProbs;
    if (bareChars.Length > 1)
    {
        notTransProbs = bareChars.ToArray().Select((oc, idx) =>
        {
            int stayed;
            if (!transCounts.TryGetValue(0, out stayed))
            {
                stayed = 0;
            }

            // Only deltas that keep the character inside the string count.
            int reachable = transCounts
                .Where(kvp => kvp.Key < input.Length - idx && kvp.Key >= -idx)
                .Sum(kvp => kvp.Value);

            return reachable == 0 ? 1.0 : (double)stayed / reachable;
        }).ToArray();
    }
    else
    {
        notTransProbs = new[] { 1.0 };
    }

    // Probability of having at least one transposition.
    double noTransAnywhere = 1.0;
    foreach (double p in notTransProbs)
    {
        noTransAnywhere *= p;
    }
    double prTrans = 1.0 - noTransAnywhere;

    // Relative probability of a typo versus a transposition.
    double typoShare = prTypo / (prTypo + prTrans);

    // Start from the original input in case no error turns out to be possible.
    string output = input;

    // Repeat until the string actually changes; an unmodified result is unlikely for most strings.
    do
    {
        // Flip a coin: is the guaranteed error a typo or a transposition?
        bool guaranteeTypo = r.NextDouble() < typoShare;

        // Per-position probability of the chosen kind of error.
        double[] errorWeights = (guaranteeTypo ? notTypoProbs : notTransProbs).Select(pr => 1.0 - pr).ToArray();

        // If that kind of error is impossible everywhere, we just can't produce one.
        if (errorWeights.Sum() == 0)
        {
            break;
        }

        var i = MultinomialSample(errorWeights);

        // Transpositions run first, then padding, then typos. An index of -1 means
        // "no guaranteed error of this kind"; only the chosen kind receives i.
        OptChar[] transposed = AddLeadingTrailingSpace(Transposize(bareChars, transCounts, guaranteeTypo ? -1 : i));
        output = Typoize(transposed, typoCounts, guaranteeTypo ? i : -1);
    } while (input == output);

    return output;
}
/// <summary>
/// Gets the distribution of typed strings for a character, reusing a distribution
/// stored in <c>_char_distributions_dict</c> when one was generated before.
/// </summary>
/// <param name="c">The character to look up; may be None.</param>
/// <param name="classification">Holds the observation counts.</param>
/// <returns>
/// The cached or newly generated distribution, or a single entry mapping the character
/// to itself with probability 1.0 when the classification has no data for it.
/// </returns>
private Dictionary<string, double> GetDistributionOfStringsForCharReuse(OptChar c, Classification classification)
{
    // None is represented as null. It cannot be dereferenced, and
    // NOTE(review): presumably cannot be used as a key of the cache either
    // (a standard Dictionary rejects null keys) — so None bypasses the cache.
    bool isNone = OptChar.get_IsNone(c);

    Dictionary<string, double> distribution;

    // If we have already generated a distribution for this character, return it.
    if (!isNone && _char_distributions_dict.TryGetValue(c, out distribution))
    {
        return distribution;
    }

    // Otherwise generate the distribution.
    distribution = GenerateDistributionForChar(c, classification);

    // No information about this character: return the character itself with probability 1.0.
    if (distribution.Count == 0)
    {
        // NOTE(review): None is mapped to the empty string here — confirm this matches
        // how the rest of the generator spells an empty character.
        string unchanged = isNone ? "" : "" + c.Value;
        distribution.Add(unchanged, 1.0);
    }

    if (!isNone)
    {
        _char_distributions_dict.Add(c, distribution);
    }

    return distribution;
}
/// <summary>
/// Runs one simulation: injects <c>_errors</c> into the workbook, simulates the user
/// removing them, computes error/effort/precision statistics into the instance fields,
/// and finally re-injects the original inputs.
/// </summary>
/// <param name="nboots">Number of bootstraps.</param>
/// <param name="xlfile">Name of the workbook.</param>
/// <param name="significance">Significance threshold for the test.</param>
/// <param name="ck">Kind of threshold function to use.</param>
/// <param name="app">Reference to the Excel application.</param>
/// <param name="c">Data from which to generate errors (not referenced in this body).</param>
/// <param name="r">A random number generator (not referenced in this body).</param>
/// <param name="analysisType">The type of analysis to run.</param>
/// <param name="weighted">Whether to weigh things.</param>
/// <param name="all_outputs">If false, only terminal outputs are considered.</param>
/// <param name="dag">Passed to <c>SimulateUser</c> and <c>Utility.SaveOutputs</c>.</param>
/// <param name="wb">The workbook; it is modified while the simulation runs.</param>
/// <param name="terminal_formula_cells">Cells whose outputs are saved.</param>
/// <param name="terminal_input_vectors">Not referenced in this body.</param>
/// <param name="original_inputs">Values re-injected at the end.</param>
/// <param name="correct_outputs">Reference outputs for the error computations.</param>
/// <param name="max_duration_in_ms">Forwarded to <c>SimulateUser</c>.</param>
/// <param name="logfile">Filename for the output log.</param>
/// <param name="pb">Progress bar forwarded to <c>SimulateUser</c>.</param>
/// <returns>The number of cells inspected (<c>_effort</c>).</returns>
public int Run(int nboots, string xlfile, double significance, CutoffKind ck, Excel.Application app, Classification c, Random r, AnalysisType analysisType, bool weighted, bool all_outputs, DAG dag, Excel.Workbook wb, AST.Address[] terminal_formula_cells, AST.Range[] terminal_input_vectors, CellDict original_inputs, CellDict correct_outputs, long max_duration_in_ms, String logfile, ProgBar pb)
{
    // Remember the workbook identity and the run configuration.
    _wb_name = xlfile;
    _wb_path = wb.Path;
    _analysis_type = analysisType;
    _significance = significance;
    _all_outputs = all_outputs;
    _weighted = weighted;

    // Inject the errors from _errors and capture the resulting outputs.
    Utility.InjectValues(app, wb, _errors);
    CellDict outputsWithErrors = Utility.SaveOutputs(terminal_formula_cells, dag);

    // Time the removal of errors; SimulateUser MODIFIES THE WORKBOOK.
    var timer = Stopwatch.StartNew();
    _user = SimulateUser(nboots, significance, ck, dag, original_inputs, _errors, correct_outputs, wb, app, analysisType, weighted, all_outputs, max_duration_in_ms, timer, logfile, pb);
    timer.Stop();
    _analysis_time = timer.Elapsed.TotalSeconds;

    // Outputs after the simulated corrections.
    var outputsAfterCorrection = Utility.SaveOutputs(terminal_formula_cells, dag);

    // Total relative error that remains.
    _error = Utility.CalculateNormalizedError(correct_outputs, outputsAfterCorrection, _user.max_errors);
    _total_relative_error = Utility.TotalRelativeError(_error);

    // Starting total relative error (normalized by the same max_errors).
    ErrorDict initialError = Utility.CalculateNormalizedError(correct_outputs, outputsWithErrors, _user.max_errors);
    _initial_total_relative_error = Utility.TotalRelativeError(initialError);

    // Effort: cells inspected relative to all cells in the DAG.
    _max_effort = dag.allCells().Length;
    _effort = (_user.true_positives.Count + _user.false_positives.Count);
    _expended_effort = (double)_effort / (double)_max_effort;

    // Average precision:
    //   AveP = (\sum_{k=1}^n (P(k) * rel(k))) / |total positives|
    // where P(k) is the precision at threshold k and
    // rel(k) is 1 if the item at k is a true positive, 0 otherwise.
    _average_precision = _user.PrecRel_at_k.Sum() / (double)_errors.Count;

    // Restore the original values.
    Utility.InjectValues(app, wb, original_inputs);

    _tree_construct_time = dag.AnalysisMilliseconds / 1000.0;

    // Flag that we're done; safe to print output results.
    _simulation_run = true;

    return _effort;
}
/// <summary>
/// Produces a modified version of <paramref name="input"/> containing at least one
/// typo or transposition, chosen according to the counts in the classification.
/// </summary>
/// <param name="input">The string to corrupt.</param>
/// <param name="c">Provides the typo and transposition count dictionaries.</param>
/// <returns>
/// A string different from <paramref name="input"/>, or <paramref name="input"/> itself
/// when the selected error kind has zero probability at every position.
/// </returns>
public string GenerateErrorString(string input, Classification c)
{
    var typoCounts = c.GetTypoDict();
    var transCounts = c.GetTranspositionDict();

    // The input as option-chars, plus a copy padded with leading/trailing 'empty characters'.
    var bareChars = StringToOptCharArray(input);
    var paddedChars = AddLeadingTrailingSpace(bareChars);

    // Marginal probability of NOT making a typo, for each padded character.
    double[] notTypoProbs = paddedChars.Select(oc =>
    {
        int unchanged;
        if (!typoCounts.TryGetValue(new Tuple<OptChar, string>(oc, OptCharToString(oc)), out unchanged))
        {
            unchanged = 0;
        }

        // Sum every observation for this character.
        // FSharpOption.None == null, hence the explicit null comparison.
        int observed = 0;
        foreach (var kvp in typoCounts)
        {
            bool sameChar = kvp.Key.Item1 == null ? oc == null : kvp.Key.Item1.Equals(oc);
            if (sameChar)
            {
                observed += kvp.Value;
            }
        }

        return observed == 0 ? 1.0 : (double)unchanged / observed;
    }).ToArray();

    // Probability of making at least one typo (might need log-probs here).
    double noTypoAnywhere = 1.0;
    foreach (double p in notTypoProbs)
    {
        noTypoAnywhere *= p;
    }
    double prTypo = 1.0 - noTypoAnywhere;

    // Marginal probability of NOT making a transposition, for each position of the
    // unpadded input. For inputs of length <= 1 it is exactly 1.
    double[] notTransProbs;
    if (bareChars.Length > 1)
    {
        notTransProbs = bareChars.ToArray().Select((oc, idx) =>
        {
            int stayed;
            if (!transCounts.TryGetValue(0, out stayed))
            {
                stayed = 0;
            }

            // Only deltas that keep the character inside the string count.
            int reachable = transCounts
                .Where(kvp => kvp.Key < input.Length - idx && kvp.Key >= -idx)
                .Sum(kvp => kvp.Value);

            return reachable == 0 ? 1.0 : (double)stayed / reachable;
        }).ToArray();
    }
    else
    {
        notTransProbs = new[] { 1.0 };
    }

    // Probability of having at least one transposition.
    double noTransAnywhere = 1.0;
    foreach (double p in notTransProbs)
    {
        noTransAnywhere *= p;
    }
    double prTrans = 1.0 - noTransAnywhere;

    // Relative probability of a typo versus a transposition.
    double typoShare = prTypo / (prTypo + prTrans);

    // Start from the original input in case no error turns out to be possible.
    string output = input;

    // Repeat until the string actually changes; an unmodified result is unlikely for most strings.
    do
    {
        // Flip a coin: is the guaranteed error a typo or a transposition?
        bool guaranteeTypo = r.NextDouble() < typoShare;

        // Per-position probability of the chosen kind of error.
        double[] errorWeights = (guaranteeTypo ? notTypoProbs : notTransProbs).Select(pr => 1.0 - pr).ToArray();

        // If that kind of error is impossible everywhere, we just can't produce one.
        if (errorWeights.Sum() == 0)
        {
            break;
        }

        var i = MultinomialSample(errorWeights);

        // Transpositions run first, then padding, then typos. An index of -1 means
        // "no guaranteed error of this kind"; only the chosen kind receives i.
        OptChar[] transposed = AddLeadingTrailingSpace(Transposize(bareChars, transCounts, guaranteeTypo ? -1 : i));
        output = Typoize(transposed, typoCounts, guaranteeTypo ? i : -1);
    } while (input == output);

    return output;
}
/// <summary>
/// For every input cell, tries <paramref name="k"/> generated error strings, keeps the
/// one with the largest total output error, and returns the worst 5% of inputs
/// (rounded up) together with their error strings.
/// </summary>
/// <param name="output_nodes">Cells whose outputs are saved after each injection.</param>
/// <param name="inputs">Input addresses mapped to their original values.</param>
/// <param name="k">Number of alternatives to consider per input.</param>
/// <param name="correct_outputs">Outputs that each perturbed result is compared against.</param>
/// <param name="app">Excel application passed to <c>InjectValues</c>.</param>
/// <param name="wb">Workbook that is temporarily modified and then restored.</param>
/// <param name="c">Classification used to generate the error strings.</param>
/// <param name="dag">Passed to <c>SaveOutputs</c>.</param>
/// <returns>Address mapped to error string for the highest-error inputs.</returns>
public static CellDict GenImportantErrors(AST.Address[] output_nodes, CellDict inputs, int k, CellDict correct_outputs, Excel.Application app, Excel.Workbook wb, Classification c, DAG dag)
{
    var generator = new ErrorGenerator();
    var worstByAddress = new Dictionary<AST.Address, Tuple<string, double>>();

    foreach (KeyValuePair<AST.Address, string> entry in inputs)
    {
        AST.Address addr = entry.Key;
        string originalValue = entry.Value;

        double worstError = 0.0;
        string worstString = "";

        // Candidate replacement strings for this cell.
        string[] candidates = generator.GenerateErrorStrings(originalValue, c, k);

        for (int n = 0; n < k; n++)
        {
            string candidate = candidates[n];

            // Inject the candidate into the workbook.
            CellDict patch = new CellDict();
            patch.Add(addr, candidate);
            InjectValues(app, wb, patch);

            // Capture the outputs while the error is present.
            CellDict perturbedOutputs = SaveOutputs(output_nodes, dag);

            // Put the original value back before evaluating the damage.
            patch.Clear();
            patch.Add(addr, originalValue);
            InjectValues(app, wb, patch);

            double candidateError = Utility.CalculateTotalError(correct_outputs, perturbedOutputs);

            // Strictly-greater comparison: the first of equally bad candidates wins.
            if (candidateError > worstError)
            {
                worstError = candidateError;
                worstString = candidate;
            }
        }

        worstByAddress.Add(addr, new Tuple<string, double>(worstString, worstError));
    }

    // Rank inputs by the error they produced, largest first, and keep the top 5%.
    int keep = (int)Math.Ceiling(0.05 * inputs.Count);
    return worstByAddress
        .OrderByDescending(pair => pair.Value.Item2)
        .Take(keep)
        .ToDictionary(pair => pair.Key, pair => pair.Value.Item1);
}
/// <summary>
/// Generates <paramref name="k"/> error strings for the same original string by
/// calling <c>GenerateErrorString</c> once per slot.
/// </summary>
/// <param name="orig">The string to corrupt.</param>
/// <param name="c">Classification forwarded to <c>GenerateErrorString</c>.</param>
/// <param name="k">Number of strings to generate.</param>
/// <returns>An array of length <paramref name="k"/>, filled in index order.</returns>
public string[] GenerateErrorStrings(string orig, Classification c, int k)
{
    var results = new string[k];

    int slot = 0;
    while (slot < k)
    {
        results[slot] = GenerateErrorString(orig, c);
        slot++;
    }

    return results;
}