Beispiel #1
0
        /// <summary>
        /// Feeds "Testing" through <c>ErrorGenerator.GenerateErrorString</c> with a hand-built
        /// typo dictionary and expects the perturbed string "TTesying".
        /// </summary>
        public void TestTypoGenerator()
        {
            var generator = new ErrorGenerator();
            var model     = new Classification();

            // Explicit typo table. Each key pairs an optional character with a string;
            // presumably (intended char, what was typed) with the int being a count or
            // weight -- confirm against Classification. Note the 't' -> "t" entry is 0.
            var typos = new Dictionary<Tuple<OptChar, string>, int>
            {
                { new Tuple<OptChar, string>(OptChar.Some('t'), "y"),  1 },
                { new Tuple<OptChar, string>(OptChar.Some('t'), "t"),  0 },
                { new Tuple<OptChar, string>(OptChar.Some('T'), "TT"), 1 },
                { new Tuple<OptChar, string>(OptChar.Some('e'), "e"),  1 },
                { new Tuple<OptChar, string>(OptChar.Some('s'), "s"),  1 },
            };

            // No transposition dictionary is set on the model; per the original author's
            // note it is empty, so no transpositions should occur.
            model.SetTypoDict(typos);
            string perturbed = generator.GenerateErrorString("Testing", model);

            Assert.AreEqual("TTesying", perturbed);
        }
        /// <summary>
        /// Verifies that generating an error string for "Testing" from an explicit typo
        /// dictionary produces "TTesying".
        /// </summary>
        public void TestTypoGenerator()
        {
            var error_source = new ErrorGenerator();
            var trained      = new Classification();

            // Build the typo dictionary by hand instead of training it.
            // Keys are (optional char, string) pairs; the meaning of the int value
            // (count vs. weight) is not visible here -- TODO confirm.
            var explicit_typos = new Dictionary<Tuple<OptChar, string>, int>();
            explicit_typos.Add(new Tuple<OptChar, string>(OptChar.Some('t'), "y"), 1);
            explicit_typos.Add(new Tuple<OptChar, string>(OptChar.Some('t'), "t"), 0);
            explicit_typos.Add(new Tuple<OptChar, string>(OptChar.Some('T'), "TT"), 1);
            explicit_typos.Add(new Tuple<OptChar, string>(OptChar.Some('e'), "e"), 1);
            explicit_typos.Add(new Tuple<OptChar, string>(OptChar.Some('s'), "s"), 1);

            // Only the typo dictionary is set; the original note states the
            // transpositions dictionary is empty, so no transpositions should occur.
            trained.SetTypoDict(explicit_typos);

            string with_errors = error_source.GenerateErrorString("Testing", trained);
            Assert.AreEqual("TTesying", with_errors);
        }
Beispiel #3
0
        /// <summary>
        /// Round-trips a <c>Classification</c> through <c>Serialize</c> and
        /// <c>Classification.Deserialize</c> and checks that the typo dictionary read back
        /// has the same number of entries as the one that was written.
        /// </summary>
        /// <remarks>
        /// Writes <c>GeneratedFiles/testfile.bin</c> two directory levels above the current
        /// working directory; the file is left on disk afterwards.
        /// </remarks>
        public void TestDeserialize()
        {
            var classification = new Classification();

            // set typo dictionary to explicit one
            var typo_dict = new Dictionary<Tuple<OptChar, string>, int>
            {
                { new Tuple<OptChar, string>(OptChar.Some('t'), "y"),  1 },
                { new Tuple<OptChar, string>(OptChar.Some('t'), "t"),  0 },
                { new Tuple<OptChar, string>(OptChar.Some('T'), "TT"), 1 },
                { new Tuple<OptChar, string>(OptChar.Some('e'), "e"),  1 },
                { new Tuple<OptChar, string>(OptChar.Some('s'), "s"),  1 },
            };
            classification.SetTypoDict(typo_dict);

            // Resolve the grandparent of the current working directory.
            var s = System.IO.Directory.GetParent(System.IO.Directory.GetCurrentDirectory());
            var v = System.IO.Directory.GetParent(s.FullName).FullName;

            // Path.Combine instead of "\\" concatenation: no doubled separators and no
            // hard-coded Windows separator.
            var output_dir = System.IO.Path.Combine(v, "GeneratedFiles");
            System.IO.Directory.CreateDirectory(output_dir);
            var full_path = System.IO.Path.Combine(output_dir, "testfile.bin");

            classification.Serialize(full_path);

            Classification c2 = Classification.Deserialize(full_path);
            var typo_dict_2 = c2.GetTypoDict();

            // Expected value first, actual second, so a failure message reads correctly.
            Assert.AreEqual(typo_dict.Count, typo_dict_2.Count);
        }
        /// <summary>
        /// Serializes a <c>Classification</c> holding an explicit typo dictionary, reads it
        /// back with <c>Classification.Deserialize</c>, and checks that the entry count of
        /// the typo dictionary survives the round trip.
        /// </summary>
        /// <remarks>
        /// The file is written to <c>GeneratedFiles/testfile.bin</c> under the grandparent of
        /// the current working directory and is not deleted afterwards.
        /// </remarks>
        public void TestDeserialize()
        {
            var classification = new Classification();

            //set typo dictionary to explicit one
            Dictionary<Tuple<OptChar, string>, int> typo_dict = new Dictionary<Tuple<OptChar, string>, int>();
            typo_dict.Add(new Tuple<OptChar, string>(OptChar.Some('t'), "y"), 1);
            typo_dict.Add(new Tuple<OptChar, string>(OptChar.Some('t'), "t"), 0);
            typo_dict.Add(new Tuple<OptChar, string>(OptChar.Some('T'), "TT"), 1);
            typo_dict.Add(new Tuple<OptChar, string>(OptChar.Some('e'), "e"), 1);
            typo_dict.Add(new Tuple<OptChar, string>(OptChar.Some('s'), "s"), 1);
            classification.SetTypoDict(typo_dict);

            // Two levels up from the current working directory.
            var s = System.IO.Directory.GetParent(System.IO.Directory.GetCurrentDirectory());
            var v = System.IO.Directory.GetParent(s.FullName).FullName;

            // Build paths with Path.Combine rather than concatenating "\\" by hand.
            var generated_dir = System.IO.Path.Combine(v, "GeneratedFiles");
            System.IO.Directory.CreateDirectory(generated_dir);
            var full_path = System.IO.Path.Combine(generated_dir, "testfile.bin");
            classification.Serialize(full_path);

            Classification c2 = Classification.Deserialize(full_path);
            var typo_dict_2 = c2.GetTypoDict();

            // Assert.AreEqual takes (expected, actual): the dictionary we wrote is the
            // expectation, the deserialized one is the value under test.
            Assert.AreEqual(typo_dict.Count, typo_dict_2.Count);
        }
Beispiel #5
0
        /// <summary>
        /// Runs <c>ErrorGenerator.GenerateErrorString</c> on "abcd" with an empty typo
        /// dictionary and three different explicit transposition dictionaries, checking the
        /// exact output for each.
        /// </summary>
        public void TestTranspositionGenerator()
        {
            // Empty typo dictionary, shared by every scenario -- so no typos are possible.
            var no_typos = new Dictionary<Tuple<OptChar, string>, int>();

            // NOTE: each scenario gets a brand-new ErrorGenerator because the distribution
            // tables are associated with the generator instance.
            Func<Dictionary<int, int>, string> perturb_abcd = transpositions =>
            {
                var generator = new ErrorGenerator();
                var model     = new Classification();

                model.SetTranspositionDict(transpositions);
                model.SetTypoDict(no_typos);
                return generator.GenerateErrorString("abcd", model);
            };

            // Scenario 1: single entry (3 -> 10). An additional (0, 1) entry was
            // deliberately left disabled in the original test.
            Assert.AreEqual("dbca", perturb_abcd(new Dictionary<int, int> { { 3, 10 } }));

            // Scenario 2: single entry (1 -> 10); again without the disabled (0, 1) entry.
            Assert.AreEqual("bcda", perturb_abcd(new Dictionary<int, int> { { 1, 10 } }));

            // Scenario 3: entries for 10, -10 and 0; the string is expected to come back
            // unchanged (presumably because +/-10 cannot apply to a 4-character string --
            // confirm against ErrorGenerator).
            Assert.AreEqual("abcd", perturb_abcd(new Dictionary<int, int> { { 10, 10 }, { -10, 10 }, { 0, 1 } }));

            // TODO: this test originally checked that a certain class of error
            // was impossible; what kind of error?
            // (disabled assertion: Assert.AreEqual(0, result3.Item2.Count);)
        }
        /// <summary>
        /// Serializes a freshly constructed <c>Classification</c> to
        /// <c>GeneratedFiles/testfile_foo.bin</c> (two directory levels above the current
        /// working directory) and confirms the file can be opened for reading.
        /// </summary>
        /// <remarks>
        /// There is no explicit assertion: the test fails only if <c>Serialize</c> or
        /// <c>File.OpenRead</c> throws. The file is left on disk.
        /// </remarks>
        public void TestSerialize()
        {
            var classification = new Classification();

            // Grandparent of the current working directory.
            var s = System.IO.Directory.GetParent(System.IO.Directory.GetCurrentDirectory());
            var v = System.IO.Directory.GetParent(s.FullName).FullName;

            // Path.Combine instead of hand-concatenated "\\" separators.
            var output_dir = System.IO.Path.Combine(v, "GeneratedFiles");
            System.IO.Directory.CreateDirectory(output_dir);
            var full_path = System.IO.Path.Combine(output_dir, "testfile_foo.bin");

            classification.Serialize(full_path);

            // Opening the file is the check; 'using' releases the handle deterministically.
            using (var t = System.IO.File.OpenRead(full_path))
            {
                // nothing to read -- successfully opening the stream is sufficient
            }
        }
Beispiel #7
0
        /// <summary>
        /// Writes a default-constructed <c>Classification</c> to disk via <c>Serialize</c>
        /// and checks that the resulting file can be opened for reading.
        /// </summary>
        /// <remarks>
        /// The target is <c>GeneratedFiles/testfile_foo.bin</c> under the grandparent of the
        /// current working directory. The test passes as long as nothing throws; the file
        /// is not cleaned up.
        /// </remarks>
        public void TestSerialize()
        {
            var classification = new Classification();
            var s = System.IO.Directory.GetParent(System.IO.Directory.GetCurrentDirectory());
            var v = System.IO.Directory.GetParent(s.FullName).FullName;

            // Compose the paths with Path.Combine rather than embedding "\\" literals.
            var generated_dir = System.IO.Path.Combine(v, "GeneratedFiles");
            System.IO.Directory.CreateDirectory(generated_dir);
            var full_path = System.IO.Path.Combine(generated_dir, "testfile_foo.bin");

            classification.Serialize(full_path);

            // File.OpenRead throws if the file is missing or unreadable, which is what
            // makes this a test; the using block guarantees the stream is closed.
            using (var t = System.IO.File.OpenRead(full_path))
            {
            }
        }
        /// <summary>
        /// Exercises <c>ErrorGenerator.GenerateErrorString</c> on "abcd" with no typos
        /// available and three hand-built transposition dictionaries, asserting the exact
        /// string produced in each case.
        /// </summary>
        public void TestTranspositionGenerator()
        {
            // --- scenario 1: transposition table { 3 -> 10 } ---
            var first_generator = new ErrorGenerator();
            var first_model     = new Classification();

            // explicit typo dictionary -- left empty so no typos are possible
            var no_typos = new Dictionary<Tuple<OptChar, string>, int>();

            // explicit transposition dictionary (an extra (0, 1) entry is intentionally omitted)
            var first_table = new Dictionary<int, int> { { 3, 10 } };

            first_model.SetTranspositionDict(first_table);
            first_model.SetTypoDict(no_typos);
            Assert.AreEqual("dbca", first_generator.GenerateErrorString("abcd", first_model));

            // --- scenario 2: transposition table { 1 -> 10 } ---
            // NOTE: a new ErrorGenerator is needed per scenario because the distribution
            // tables are associated with it.
            var second_generator = new ErrorGenerator();
            var second_table     = new Dictionary<int, int> { { 1, 10 } }; // (0, 1) intentionally omitted
            var second_model     = new Classification();

            second_model.SetTranspositionDict(second_table);
            second_model.SetTypoDict(no_typos);
            Assert.AreEqual("bcda", second_generator.GenerateErrorString("abcd", second_model));

            // --- scenario 3: transposition table { 10 -> 10, -10 -> 10, 0 -> 1 } ---
            var third_generator = new ErrorGenerator();
            var third_table     = new Dictionary<int, int> { { 10, 10 }, { -10, 10 }, { 0, 1 } };
            var third_model     = new Classification();

            third_model.SetTranspositionDict(third_table);
            third_model.SetTypoDict(no_typos);
            Assert.AreEqual("abcd", third_generator.GenerateErrorString("abcd", third_model));

            // TODO: this test originally checked that a certain class of error
            // was impossible; what kind of error?
            // (disabled assertion: Assert.AreEqual(0, result3.Item2.Count);)
        }
Beispiel #9
0
        /// <summary>
        /// Trains a <c>Classification</c> by calling <c>ProcessTypos</c> 100,000 times --
        /// roughly 95% with the correct string and 5% with one of three erroneous variants --
        /// then checks that <c>CharErrorRate()</c> does not exceed the error rate tallied by
        /// the test itself (plus a small epsilon).
        /// </summary>
        /// <remarks>
        /// <c>outcome.Item2</c> is treated as the number of erroneous characters reported
        /// for one (original, entered) pair. The random source is unseeded, so the exact
        /// mix of trials differs between runs.
        /// </remarks>
        public void TypoClassify()
        {
            const int    NUMTRIALS = 100000;
            const double EPSILON   = 0.00001;
            var          original  = "Testing";

            string[] errors = { "Tesying", "eTTsting", "Tessting" };

            // training the model with 5% erroneous strings
            var c                = new UserSimulation.Classification();
            var rnd              = new Random();
            var p_correct        = 0.95;
            int total_char_count = 0;
            int bad_char_count   = 0;

            for (int i = 0; i < NUMTRIALS; i++)
            {
                // Decide whether this trial is typed correctly; only draw an error
                // variant when it is not (same RNG call sequence as before).
                bool typed_correctly = rnd.NextDouble() <= p_correct;
                var  entered         = typed_correctly ? original : errors[rnd.Next(errors.Length)];
                var  outcome         = c.ProcessTypos(original, entered);

                // Assert takes (expected, actual): 0 is the reference value in both checks.
                if (typed_correctly)
                {
                    Assert.AreEqual(0, outcome.Item2);
                }
                else
                {
                    Assert.AreNotEqual(0, outcome.Item2);
                }

                bad_char_count   += outcome.Item2;
                // NOTE(review): the "+ 1" presumably accounts for an end-of-string
                // position counted by ProcessTypos -- confirm against Classification.
                total_char_count += original.Length + 1;
            }

            // the per-character error rate should be at most the following:
            double shouldbe_p_incorrect = (double)bad_char_count / (double)total_char_count + EPSILON;
            double observed_p_incorrect = c.CharErrorRate();

            Assert.IsTrue(observed_p_incorrect <= shouldbe_p_incorrect,
                          string.Format("Observed per-character error rate {0} exceeds the expected upper bound {1}.",
                                        observed_p_incorrect, shouldbe_p_incorrect));
        }
Beispiel #10
0
        /// <summary>
        /// Runs 100 single-error simulations: each iteration picks a random input cell whose
        /// original value parses as a number, replaces it with a subtly wrong value from
        /// <c>ErrorGenerator.GenerateSubtleErrorString</c>, and hands that one-cell error set
        /// to <c>RunSimulation</c>.
        /// </summary>
        /// <param name="app">Excel application; forwarded to <c>Prep.PrepSimulation</c> and <c>RunSimulation</c>.</param>
        /// <param name="wbh">Workbook under test; forwarded likewise.</param>
        /// <param name="nboots">Forwarded to <c>RunSimulation</c>.</param>
        /// <param name="significance">Forwarded to <c>RunSimulation</c>.</param>
        /// <param name="threshold">Forwarded to <c>RunSimulation</c>.</param>
        /// <param name="c">Classification used to generate the subtle error and forwarded to <c>RunSimulation</c>.</param>
        /// <param name="r">Random source forwarded to <c>RunSimulation</c> (input selection here uses <c>Shuffle()</c>).</param>
        /// <param name="outfile">Results file forwarded to <c>RunSimulation</c>.</param>
        /// <param name="max_duration_in_ms">Forwarded to <c>RunSimulation</c>.</param>
        /// <param name="logfile">Forwarded to <c>RunSimulation</c>.</param>
        /// <param name="pb">Progress bar; its maximum is set to 5 here.</param>
        /// <param name="ignore_parse_errors">Forwarded to <c>Prep.PrepSimulation</c>.</param>
        /// <returns>
        /// <c>false</c> as soon as an iteration finds no input whose original value parses
        /// with <c>Double.TryParse</c>; <c>true</c> after all 100 iterations have run.
        /// </returns>
        public static bool RunSubletyExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // record initial state of spreadsheet
            var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // init error generator
            var eg = new ErrorGenerator();

            // get inputs as an array of addresses to facilitate random selection
            // DATA INPUTS ONLY
            AST.Address[] inputs = prepdata.dag.terminalInputCells();

            for (int i = 0; i < 100; i++)
            {
                // Randomly choose a *numeric* input: shuffle once, then walk the shuffled
                // list by index until a value parses or the candidates run out. (Walking
                // by index avoids rebuilding the list for every rejected candidate.)
                var         rnd_addrs = inputs.Shuffle().ToList();
                int         next      = 0;
                bool        num_found = false;
                double      input_value;
                AST.Address rand_addr;
                do
                {
                    // if there are no candidates left, fail
                    if (next >= rnd_addrs.Count)
                    {
                        return(false);
                    }
                    rand_addr = rnd_addrs[next];
                    next++;

                    // get the original value and try parsing it as a number
                    String input_string = prepdata.original_inputs[rand_addr];
                    num_found = Double.TryParse(input_string, out input_value);
                } while (!num_found);

                // perturb it
                String erroneous_input = eg.GenerateSubtleErrorString(input_value, c);

                // create an error dictionary with this one perturbed value
                var errors = new CellDict();
                errors.Add(rand_addr, erroneous_input);

                // run simulations; simulation code does insertion of errors and restore of originals
                RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, errors);
            }

            return(true);
        }
Beispiel #11
0
        /// <summary>
        /// Runs 100 single-error simulations. Each round picks one input cell at random
        /// (using <paramref name="r"/>), perturbs its original string with
        /// <c>ErrorGenerator.GenerateErrorString</c>, and passes that one-cell error set to
        /// <c>RunSimulation</c>.
        /// </summary>
        /// <remarks>
        /// Throws <c>Exception("Missing address!")</c> up front if any terminal input cell
        /// has no entry in <c>prepdata.original_inputs</c>.
        /// NOTE(review): when there are no input cells at all, <c>r.Next(0)</c> yields 0 and
        /// indexing the empty array throws -- callers may want a guard.
        /// </remarks>
        public static void RunProportionExperiment(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // snapshot the spreadsheet before anything is perturbed
            var prepdata = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // source of perturbed strings
            var eg = new ErrorGenerator();

            // data input cells only, as an array so one can be picked by index
            AST.Address[] inputs = prepdata.dag.terminalInputCells();

            // sanity check: every input address must have a recorded original value
            for (int k = 0; k < inputs.Length; k++)
            {
                bool has_original = prepdata.original_inputs.ContainsKey(inputs[k]);
                if (has_original)
                {
                    continue;
                }
                throw new Exception("Missing address!");
            }

            for (int trial = 1; trial <= 100; trial++)
            {
                // pick the victim cell
                int         pick   = r.Next(inputs.Length);
                AST.Address target = inputs[pick];

                // original value -> perturbed value
                String pristine  = prepdata.original_inputs[target];
                String perturbed = eg.GenerateErrorString(pristine, c);

                // error set containing just this one cell
                var single_error = new CellDict();
                single_error.Add(target, perturbed);

                // the simulation inserts the error and restores the original afterwards
                RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, prepdata, single_error);
            }
        }
Beispiel #12
0
        /// <summary>
        /// Prepares the workbook, generates an error set with
        /// <c>UserSimulation.Utility.GenImportantErrors</c>, and runs <c>RunSimulation</c>
        /// once over that set.
        /// </summary>
        public static void RunSimulationPaperMain(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, bool ignore_parse_errors)
        {
            pb.setMax(5);

            // capture the spreadsheet's starting state
            var snapshot = Prep.PrepSimulation(app, wbh, pb, ignore_parse_errors);

            // Build the error set. The literal 5 is passed through unchanged; presumably
            // it is the number of errors to generate -- confirm against GenImportantErrors.
            CellDict important_errors = UserSimulation.Utility.GenImportantErrors(
                snapshot.terminal_formula_nodes,
                snapshot.original_inputs,
                5,
                snapshot.correct_outputs,
                app,
                wbh,
                c,
                snapshot.dag);

            // run the paper's simulations over that error set
            RunSimulation(app, wbh, nboots, significance, threshold, c, r, outfile, max_duration_in_ms, logfile, pb, snapshot, important_errors);
        }
Beispiel #13
0
        /// <summary>
        /// Runs the enabled analyses over one pre-generated error set and appends each
        /// result to <paramref name="outfile"/> as CSV. The CSV header row is written first
        /// when the file does not exist yet.
        /// </summary>
        /// <remarks>
        /// Two analyses are active: CheckCell10 with <c>QuantileCutoff(0.10)</c> and
        /// NormalAllInputs with <c>NormalCutoff(threshold)</c>. Two others -- CheckCell5 with
        /// <c>QuantileCutoff(0.05)</c> and NormalPerRange with <c>NormalCutoff(threshold)</c>
        /// -- are disabled, but their progress increments are kept, so the progress bar
        /// advances four times per call.
        /// </remarks>
        public static void RunSimulation(Excel.Application app, Excel.Workbook wbh, int nboots, double significance, double threshold, UserSimulation.Classification c, Random r, String outfile, long max_duration_in_ms, String logfile, ProgBar pb, PrepData prepdata, CellDict errors)
        {
            // write header if needed
            bool results_file_is_new = !System.IO.File.Exists(outfile);
            if (results_file_is_new)
            {
                System.IO.File.AppendAllText(outfile, Simulation.HeaderRowForCSV());
            }

            // Step 1 of 4 -- CheckCell5 (weighted, all outputs, 5% quantile): disabled.
            // Only the progress tick remains.
            pb.IncrementProgress();

            // Step 2 of 4 -- CheckCell10: weighted, all outputs, 10% quantile.
            var quantile_sim = new UserSimulation.Simulation();
            quantile_sim.RunFromBatch(
                nboots,                                   // number of bootstraps
                wbh.FullName,                             // Excel filename
                significance,                             // statistical significance threshold
                app,                                      // Excel.Application
                new QuantileCutoff(0.10),                 // max % extreme values to flag
                c,                                        // classification data
                r,                                        // random number generator
                UserSimulation.AnalysisType.CheckCell10,  // analysis type
                true,                                     // weighted analysis
                true,                                     // use all outputs for analysis
                prepdata.dag,                             // analysis data (dependence graph)
                wbh,                                      // Excel.Workbook
                errors,                                   // pre-generated errors
                prepdata.terminal_input_nodes,            // input range nodes
                prepdata.terminal_formula_nodes,          // output nodes
                prepdata.original_inputs,                 // original input values
                prepdata.correct_outputs,                 // original output values
                max_duration_in_ms,                       // max duration of simulation
                logfile);
            System.IO.File.AppendAllText(outfile, quantile_sim.FormatResultsAsCSV());
            pb.IncrementProgress();

            // Step 3 of 4 -- Normal, all inputs. Per the original annotations, nboots and
            // the two boolean flags are irrelevant for this analysis type.
            var normal_sim = new UserSimulation.Simulation();
            normal_sim.RunFromBatch(
                nboots,                                      // irrelevant
                wbh.FullName,                                // Excel filename
                significance,                                // normal cutoff? (unclear in original)
                app,                                         // Excel.Application
                new NormalCutoff(threshold),                 // cutoff built from 'threshold' (undocumented in original)
                c,                                           // classification data
                r,                                           // random number generator
                UserSimulation.AnalysisType.NormalAllInputs, // analysis type
                true,                                        // irrelevant
                true,                                        // irrelevant
                prepdata.dag,                                // analysis data (dependence graph)
                wbh,                                         // Excel.Workbook
                errors,                                      // pre-generated errors
                prepdata.terminal_input_nodes,               // input range nodes
                prepdata.terminal_formula_nodes,             // output nodes
                prepdata.original_inputs,                    // original input values
                prepdata.correct_outputs,                    // original output values
                max_duration_in_ms,                          // max duration of simulation
                logfile);
            System.IO.File.AppendAllText(outfile, normal_sim.FormatResultsAsCSV());
            pb.IncrementProgress();

            // Step 4 of 4 -- Normal, per-range inputs (NormalPerRange): disabled.
            // Only the progress tick remains.
            pb.IncrementProgress();
        }
Beispiel #14
0
        /// <summary>
        /// Feeds 100,000 (original, entered) pairs to <c>Classification.ProcessTypos</c> --
        /// about 5% of them erroneous -- and verifies that <c>CharErrorRate()</c> is no
        /// larger than the per-character error rate counted by the test (plus epsilon).
        /// </summary>
        /// <remarks>
        /// <c>outcome.Item2</c> is used as the number of bad characters for a single pair.
        /// The <c>Random</c> instance is unseeded, so the trial mix varies between runs.
        /// </remarks>
        public void TypoClassify()
        {
            const int NUMTRIALS = 100000;
            const double EPSILON = 0.00001;
            var original = "Testing";
            string[] errors = { "Tesying", "eTTsting", "Tessting" };

            // training the model with 5% erroneous strings
            var c = new UserSimulation.Classification();
            var rnd = new Random();
            var p_correct = 0.95;
            int total_char_count = 0;
            int bad_char_count = 0;
            for (int i = 0; i < NUMTRIALS; i++)
            {
                var j = rnd.NextDouble();
                bool is_error_trial = j > p_correct;

                // Pick an erroneous variant only for error trials; index by the array
                // length instead of a hard-coded 3 so the list can grow safely.
                string entered = is_error_trial ? errors[rnd.Next(errors.Length)] : original;
                var outcome = c.ProcessTypos(original, entered);

                // (expected, actual) order so failure messages read correctly.
                if (is_error_trial)
                {
                    Assert.AreNotEqual(0, outcome.Item2);
                }
                else
                {
                    Assert.AreEqual(0, outcome.Item2);
                }

                bad_char_count += outcome.Item2;
                // NOTE(review): "+ 1" presumably covers an extra end-of-string slot that
                // ProcessTypos counts -- verify against Classification.
                total_char_count += original.Length + 1;
            }

            // the per-character error rate should be at most the following:
            double shouldbe_p_incorrect = (double)bad_char_count / (double)total_char_count + EPSILON;
            double observed_p_incorrect = c.CharErrorRate();
            Assert.IsTrue(observed_p_incorrect <= shouldbe_p_incorrect,
                          string.Format("Observed per-character error rate {0} exceeds the expected upper bound {1}.",
                                        observed_p_incorrect, shouldbe_p_incorrect));
        }
Beispiel #15
0
        /// <summary>
        /// Runs the subtlety experiment for one workbook: derives the result and log file
        /// paths under <paramref name="output_dir"/>, disables screen updating, calls
        /// <c>UserSimulation.Config.RunSubletyExperiment</c>, and shows a message box if
        /// that call reports that the spreadsheet has no numeric inputs.
        /// </summary>
        /// <param name="app">Excel application whose screen updating is suspended for the run.</param>
        /// <param name="wb">Workbook to simulate; its name determines the log file name.</param>
        /// <param name="rng">Random source forwarded to the experiment.</param>
        /// <param name="c">Classification data forwarded to the experiment.</param>
        /// <param name="output_dir">Directory that receives the results CSV and the per-workbook log.</param>
        /// <param name="thresh">Threshold forwarded to the experiment.</param>
        /// <param name="pb">Progress bar forwarded to the experiment.</param>
        private static void RunSubletyExperiment(Excel.Application app, Excel.Workbook wb, Random rng, UserSimulation.Classification c, string output_dir, double thresh, ProgBar pb)
        {
            // number of bootstraps
            var NBOOTS = 2700;

            // the short name (not the full path) of the workbook being simulated;
            // taken from 'wb' so the log is named after the workbook actually passed in,
            // even if it is not the active one
            var filename = wb.Name;

            // the default output filenames; the log file is named after the workbook
            // with its extension stripped. (The previous pattern "(.+)\.xls|xlsx"
            // yielded an empty base name for anything not containing ".xls".)
            var default_output_file = "simulation_results.csv";
            var default_log_file    = System.IO.Path.GetFileNameWithoutExtension(filename) + ".iterlog.csv";

            // save file location (will append for additional runs)
            var savefile = System.IO.Path.Combine(output_dir, default_output_file);

            // log file location (new file for each new workbook)
            var logfile = System.IO.Path.Combine(output_dir, default_log_file);

            // disable screen updating
            app.ScreenUpdating = false;
            try
            {
                // run simulations
                if (!UserSimulation.Config.RunSubletyExperiment(app, wb, NBOOTS, 0.95, thresh, c, rng, savefile, MAX_DURATION_IN_MS, logfile, pb, IGNORE_PARSE_ERRORS))
                {
                    System.Windows.Forms.MessageBox.Show("This spreadsheet contains no numeric inputs.");
                }
            }
            finally
            {
                // re-enable screen updating even if the experiment throws, so Excel is
                // not left with a frozen display
                app.ScreenUpdating = true;
            }
        }