/// <summary>
/// Converts the transposition counts held by <paramref name="classification"/>
/// into relative frequencies.
/// </summary>
/// <param name="classification">Supplies the transposition count dictionary.</param>
/// <returns>
/// A dictionary with the same keys as the transposition dictionary, where each
/// value is that key's count divided by the sum of all counts.
/// </returns>
private Dictionary<int, double> GenerateTranspositionsDistribution(Classification classification)
{
    var counts = classification.GetTranspositionDict();

    // Common denominator for every entry.
    var total = counts.Sum(entry => entry.Value);

    // NOTE(review): when the dictionary is non-empty but every count is zero,
    // the division below is 0.0 / 0 and the stored values are NaN - confirm
    // whether callers can ever see that input.
    var frequencies = new Dictionary<int, double>();
    foreach (var entry in counts)
    {
        frequencies.Add(entry.Key, (double)entry.Value / total);
    }

    return frequencies;
}
/// <summary>
/// Produces an erroneous variant of <paramref name="input"/> by running the
/// transposition step followed by the typo step, with exactly one of the two
/// steps being told which position must receive an error.
/// </summary>
/// <param name="input">The original string.</param>
/// <param name="c">Supplies the typo and transposition count dictionaries.</param>
/// <returns>
/// A string that differs from <paramref name="input"/>; or
/// <paramref name="input"/> itself when the selected error kind has no
/// position with a non-zero error probability.
/// </returns>
public string GenerateErrorString(string input, Classification c)
{
    var typoCounts = c.GetTypoDict();
    var transCounts = c.GetTranspositionDict();

    // Optional-char form of the input, and a copy padded with an
    // 'empty character' at each end.
    var ochars = StringToOptCharArray(input);
    var padded = AddLeadingTrailingSpace(ochars);

    // Per padded character: probability of NOT mistyping it, i.e. the count
    // recorded for "typed as itself" over the count of everything recorded
    // for that character. Characters never seen get probability 1.
    double[] notTypoByChar = padded.Select(oc =>
    {
        int correctCount;
        if (!typoCounts.TryGetValue(new Tuple<OptChar, string>(oc, OptCharToString(oc)), out correctCount))
        {
            correctCount = 0;
        }

        // Explicit null test because FSharpOption.None == null.
        int observed = typoCounts
            .Where(kvp => kvp.Key.Item1 == null ? oc == null : kvp.Key.Item1.Equals(oc))
            .Aggregate(0, (acc, kvp) => acc + kvp.Value);

        return observed == 0 ? 1.0 : (double)correctCount / observed;
    }).ToArray();

    // Probability of at least one typo anywhere (might need log-probs here).
    double prTypo = 1.0 - notTypoByChar.Aggregate(1.0, (acc, p) => acc * p);

    // Per position of the unpadded input: probability of NOT transposing.
    // A string of length <= 1 gets the single probability 1.
    double[] notTransByPos;
    if (ochars.Length > 1)
    {
        // The count stored under key 0 is the numerator for every position.
        int stayCount;
        if (!transCounts.TryGetValue(0, out stayCount))
        {
            stayCount = 0;
        }

        notTransByPos = ochars.Select((ignored, idx) =>
        {
            // Only keys in [-idx, input.Length - idx) are counted
            // (presumably the offsets that keep the target inside the string).
            int reachable = transCounts
                .Where(kvp => kvp.Key >= -idx && kvp.Key < input.Length - idx)
                .Sum(kvp => kvp.Value);

            return reachable == 0 ? 1.0 : (double)stayCount / reachable;
        }).ToArray();
    }
    else
    {
        notTransByPos = new[] { 1.0 };
    }

    // Probability of at least one transposition anywhere.
    double prTrans = 1.0 - notTransByPos.Aggregate(1.0, (acc, p) => acc * p);

    // Share of the total error probability that belongs to typos. If both
    // probabilities are zero this is NaN; the comparison in the loop is then
    // false and the transposition branch is taken.
    double typoShare = prTypo / (prTypo + prTrans);

    // Start from the input so that it is what we return if no error is possible.
    string output = input;

    // Retry until the result actually differs from the input.
    // NOTE(review): termination relies on Transposize/Typoize eventually
    // producing a change - confirm for inputs such as repeated characters.
    do
    {
        // Coin flip: is the guaranteed error a typo or a transposition?
        bool forceTypo = r.NextDouble() < typoShare;

        // Per-position probability of the chosen kind of error.
        double[] weights = (forceTypo ? notTypoByChar : notTransByPos)
            .Select(p => 1.0 - p)
            .ToArray();

        // No position can produce this kind of error: give up.
        if (weights.Sum() == 0)
        {
            break;
        }

        var forced = MultinomialSample(weights);

        // Transpose first, then re-pad, then apply typos. The step that is not
        // carrying the guaranteed error receives -1 as its index. For typos
        // the sampled index already refers to the padded array.
        OptChar[] transposed = AddLeadingTrailingSpace(Transposize(ochars, transCounts, forceTypo ? -1 : forced));
        output = Typoize(transposed, typoCounts, forceTypo ? forced : -1);
    } while (output == input);

    return output;
}
/// <summary>
/// Produces an erroneous variant of <paramref name="input"/> by running the
/// transposition step followed by the typo step, with exactly one of the two
/// steps being told which position must receive an error.
/// </summary>
/// <param name="input">The original string.</param>
/// <param name="c">Supplies the typo and transposition count dictionaries.</param>
/// <returns>
/// A string that differs from <paramref name="input"/>; or
/// <paramref name="input"/> itself when the selected error kind has no
/// position with a non-zero error probability.
/// </returns>
public string GenerateErrorString(string input, Classification c)
{
    var typoCounts = c.GetTypoDict();
    var transCounts = c.GetTranspositionDict();

    // Optional-char form of the input, and a copy padded with an
    // 'empty character' at each end.
    var ochars = StringToOptCharArray(input);
    var padded = AddLeadingTrailingSpace(ochars);

    // Per padded character: probability of NOT mistyping it, i.e. the count
    // recorded for "typed as itself" over the count of everything recorded
    // for that character. Characters never seen get probability 1.
    double[] notTypoByChar = padded.Select(oc =>
    {
        int correctCount;
        if (!typoCounts.TryGetValue(new Tuple<OptChar, string>(oc, OptCharToString(oc)), out correctCount))
        {
            correctCount = 0;
        }

        // Explicit null test because FSharpOption.None == null.
        int observed = typoCounts
            .Where(kvp => kvp.Key.Item1 == null ? oc == null : kvp.Key.Item1.Equals(oc))
            .Aggregate(0, (acc, kvp) => acc + kvp.Value);

        return observed == 0 ? 1.0 : (double)correctCount / observed;
    }).ToArray();

    // Probability of at least one typo anywhere (might need log-probs here).
    double prTypo = 1.0 - notTypoByChar.Aggregate(1.0, (acc, p) => acc * p);

    // Per position of the unpadded input: probability of NOT transposing.
    // A string of length <= 1 gets the single probability 1.
    double[] notTransByPos;
    if (ochars.Length > 1)
    {
        // The count stored under key 0 is the numerator for every position.
        int stayCount;
        if (!transCounts.TryGetValue(0, out stayCount))
        {
            stayCount = 0;
        }

        notTransByPos = ochars.Select((ignored, idx) =>
        {
            // Only keys in [-idx, input.Length - idx) are counted
            // (presumably the offsets that keep the target inside the string).
            int reachable = transCounts
                .Where(kvp => kvp.Key >= -idx && kvp.Key < input.Length - idx)
                .Sum(kvp => kvp.Value);

            return reachable == 0 ? 1.0 : (double)stayCount / reachable;
        }).ToArray();
    }
    else
    {
        notTransByPos = new[] { 1.0 };
    }

    // Probability of at least one transposition anywhere.
    double prTrans = 1.0 - notTransByPos.Aggregate(1.0, (acc, p) => acc * p);

    // Share of the total error probability that belongs to typos. If both
    // probabilities are zero this is NaN; the comparison in the loop is then
    // false and the transposition branch is taken.
    double typoShare = prTypo / (prTypo + prTrans);

    // Start from the input so that it is what we return if no error is possible.
    string output = input;

    // Retry until the result actually differs from the input.
    // NOTE(review): termination relies on Transposize/Typoize eventually
    // producing a change - confirm for inputs such as repeated characters.
    do
    {
        // Coin flip: is the guaranteed error a typo or a transposition?
        bool forceTypo = r.NextDouble() < typoShare;

        // Per-position probability of the chosen kind of error.
        double[] weights = (forceTypo ? notTypoByChar : notTransByPos)
            .Select(p => 1.0 - p)
            .ToArray();

        // No position can produce this kind of error: give up.
        if (weights.Sum() == 0)
        {
            break;
        }

        var forced = MultinomialSample(weights);

        // Transpose first, then re-pad, then apply typos. The step that is not
        // carrying the guaranteed error receives -1 as its index. For typos
        // the sampled index already refers to the padded array.
        OptChar[] transposed = AddLeadingTrailingSpace(Transposize(ochars, transCounts, forceTypo ? -1 : forced));
        output = Typoize(transposed, typoCounts, forceTypo ? forced : -1);
    } while (output == input);

    return output;
}
/// <summary>
/// Normalizes the transposition counts of <paramref name="classification"/>
/// so that each key maps to its share of the total count.
/// </summary>
/// <param name="classification">Supplies the transposition count dictionary.</param>
/// <returns>
/// A new dictionary keyed like the transposition dictionary, with each value
/// equal to that key's count divided by the sum of all counts.
/// </returns>
private Dictionary<int, double> GenerateTranspositionsDistribution(Classification classification)
{
    var counts = classification.GetTranspositionDict();

    // Sum of every recorded count; used as the divisor below.
    var total = counts.Sum(entry => entry.Value);

    // NOTE(review): a non-empty dictionary whose counts are all zero yields
    // NaN values here (0.0 / 0) - confirm whether that input can occur.
    return counts.ToDictionary(
        entry => entry.Key,
        entry => (double)entry.Value / total);
}