예제 #1
0
        /// <summary>
        /// Prints the pattern and dissimilarity for every unordered pair of the input strings,
        /// then the number of patterns consistent with all inputs, and finally a random sample
        /// of those patterns together with their negated score-feature values.
        /// </summary>
        /// <param name="opts">Options carrying the input strings and the number of patterns to sample.</param>
        /// <returns>Always 0.</returns>
        public static int ComputeEta(EtaOptions opts)
        {
            var inputStates = opts.Strings.Select(ParseNullEmpty).Select(Synthesizer.StringToState).ToArray();
            var pairCosts   = new List <double>();

            // Walk each unordered pair (first < second) exactly once, in index order.
            for (int first = 0; first < inputStates.Length; first++)
            {
                for (int second = first + 1; second < inputStates.Length; second++)
                {
                    var scored = Similarity.ScoreStates(true, inputStates[first], inputStates[second]);
                    Console.WriteLine($"> {{ '{scored.A}' , '{scored.B}' }} => {scored.Pattern}");
                    Console.WriteLine($"> Pairwise Dissimilarity = {scored.Cost}\n");
                    pairCosts.Add(scored.Cost);
                }
            }

            // The average and the overall best pattern are only reported for more than two inputs.
            if (opts.Strings.Count() > 2)
            {
                Console.WriteLine($"> Avg. Pairwise Dissimilarity = {pairCosts.Sum() / pairCosts.Count,12:F5}");
                var bestProgram = Synthesizer.Learn(1, inputStates);
                Console.WriteLine($"> Best Overall Pattern = {bestProgram.Description()}");
            }

            var allPrograms = Synthesizer.LearnAll(inputStates);

            Console.WriteLine($"> Total Number of Consistent Patterns = {allPrograms.Size}.");

            // Nothing to sample when fewer than one candidate was requested.
            if (opts.NumCandidates < 1)
            {
                return 0;
            }

            // Sample at most as many patterns as were learned.
            int sampleSize;
            if (allPrograms.Size > opts.NumCandidates)
            {
                sampleSize = opts.NumCandidates;
            }
            else
            {
                sampleSize = (int)allPrograms.Size;
            }

            Console.WriteLine($"\n> {sampleSize} Randomly Selected Patterns:");

            var generator = new Random();

            // Ordered by description; programs with equal descriptions are stored only once.
            var byDescription = Comparer <ProgramNode> .Create(
                (left, right) => left.Description().CompareTo(right.Description()));
            var chosen = new SortedSet <ProgramNode>(byDescription);

            // Keep drawing until enough distinct descriptions have been collected.
            while (chosen.Count < sampleSize)
            {
                chosen.Add(allPrograms.Sample(generator));
            }

            foreach (var candidate in chosen)
            {
                Console.WriteLine($"  * {-candidate.GetFeatureValue(MText.Learner.Instance.ScoreFeature),12:F5} : {candidate.Description()}");
            }

            return 0;
        }
예제 #2
0
        /// <summary>
        /// Writes pairwise scoring records to "Similarity.FlashProfile.log" in the logs directory.
        /// For each clean dataset it logs up to <c>opts.SimCount</c> records for pairs of unequal
        /// strings drawn from that dataset (ground truth <c>true</c>), followed by records pairing
        /// up to <c>opts.DisCount</c> of its strings with up to <c>opts.DisCount</c> strings of
        /// every other dataset (ground truth <c>false</c>).
        /// </summary>
        /// <param name="opts">Supplies the <c>SimCount</c> and <c>DisCount</c> limits.</param>
        /// <returns>Always 0.</returns>
        public static int Estimate(SimilarityOptions opts)
        {
            // Fixed seed: the shuffles below are driven by this single generator.
            var shuffler = new Random(0xf00d);

            // Throw-away learning call to warm up PROSE; the first learning call always takes longer.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            var logFile = Path.Combine(Utils.Paths.LogsDir, "Similarity.FlashProfile.log");

            // Start from an empty log.
            File.WriteAllText(logFile, "");

            int positives = 0;
            int negatives = 0;

            for (int index = 0; index < Utils.Paths.CleanDatasets.Length; ++index)
            {
                double percentDone = (100.0 * index) / Utils.Paths.CleanDatasets.Length;
                Console.Write($"\r[+] Saving to {logFile}: [{positives,8} +ve, {negatives,8} -ve] ... {percentDone,5:F2} %");

                var           currentFile = Utils.Paths.CleanDatasets[index];
                List <string> inputs      = TestCase.LoadNonEmptyData(currentFile);

                // Lazily pair a shuffled copy of the dataset with freshly shuffled copies of itself,
                // skipping equal strings; only the first SimCount pairs are ever scored.
                var similarRecords = inputs.OrderBy(s => shuffler.Next())
                                     .SelectMany(s1 => inputs.OrderBy(s => shuffler.Next()), (s1, s2) => new { s1, s2 })
                                     .Where(pair => pair.s1 != pair.s2)
                                     .Select(pair => ComputeScoreForStrings(true, pair.s1, pair.s2))
                                     .Take(opts.SimCount);
                foreach (var record in similarRecords)
                {
                    File.AppendAllText(logFile, $"{record}\n");
                    positives++;
                }

                // Pair a sample of this dataset with a sample of each other dataset.
                var dissimilarRecords = Utils.Paths.CleanDatasets
                                        .Where(otherFile => otherFile != currentFile)
                                        .Select(otherFile => TestCase.LoadNonEmptyData(otherFile).OrderBy(s => shuffler.Next()).Take(opts.DisCount))
                                        .SelectMany(foreign => inputs.OrderBy(s => shuffler.Next()).Take(opts.DisCount)
                                                    .SelectMany(s1 => foreign, (s1, s2) => ComputeScoreForStrings(false, s1, s2)));
                foreach (var record in dissimilarRecords)
                {
                    File.AppendAllText(logFile, $"{record}\n");
                    negatives++;
                }
            }
            Console.WriteLine($"\r[+] Saving to {logFile}: [{positives,8} +ve, {negatives,8} -ve] ... 100 %");

            return 0;
        }
예제 #3
0
        /// <summary>
        /// Learns a single program from the two given states, times the learning call, clears the
        /// engine's learning cache, and packages the outcome as a <c>SimData</c> record.
        /// </summary>
        /// <param name="ground_truth">Value stored unchanged in the record's <c>GroundTruth</c>.</param>
        /// <param name="A">First state.</param>
        /// <param name="B">Second state.</param>
        /// <returns>
        /// A record holding both inputs as strings, the synthesis time in milliseconds, the learned
        /// pattern description ("&lt;NULL&gt;" when nothing was learned), the cost and the score.
        /// When no program was learned, cost and score are derived from <c>DefaultTokens.Any.Score</c>.
        /// </returns>
        public static SimData ScoreStates(bool ground_truth, State A, State B)
        {
            // Only the learning call itself is timed.
            var timer = Stopwatch.StartNew();
            ProgramNode learned = Synthesizer.Learn(1, A, B);
            timer.Stop();

            Synthesizer.Engine.ClearLearningCache();

            var textA     = Synthesizer.StateToString(A);
            var textB     = Synthesizer.StateToString(B);
            var elapsedMs = Convert.ToUInt32(timer.ElapsedMilliseconds);

            var pattern = learned == null
                          ? "<NULL>"
                          : learned.AcceptVisitor(new TokensCollector()).CombinedDescription();

            // Cost is the negated score feature of the learned program (or of the default Any token).
            var cost = Convert.ToSingle(learned == null
                                        ? -DefaultTokens.Any.Score
                                        : -learned.GetFeatureValue(Learner.Instance.ScoreFeature));

            // Score is the reciprocal of the transformed cost.
            var score = Convert.ToSingle(1.0 / (learned == null
                                                ? Witnesses.ScoreTransform(-DefaultTokens.Any.Score)
                                                : Witnesses.ScoreTransform(-learned.GetFeatureValue(Learner.Instance.ScoreFeature))));

            return new SimData {
                A = textA,
                B = textB,
                GroundTruth = ground_truth,
                SynthesisTime = elapsedMs,
                Pattern = pattern,
                Cost = cost,
                Score = score
            };
        }
예제 #4
0
        /// <summary>
        /// For every cluster count N in [<c>opts.MinClusters</c>, <c>opts.MaxClusters</c>], runs
        /// <c>opts.TrialsPerClustering</c> trials. Each trial picks N random clean datasets, samples
        /// up to <c>opts.NumStringsPerCluster</c> distinct strings from each, learns a profile over
        /// the mixed strings, regroups the strings by the profile's matching tokens, and compares
        /// that grouping with the original one via <c>NormalizedMutualInfo</c>.
        /// Details go to "NMI-{Mu}x{Theta}.log" in the logs directory; a summary goes to the console.
        /// </summary>
        /// <param name="opts">Cluster range, trial count, sample size and the Mu/Theta clustering parameters.</param>
        /// <returns>Always 0.</returns>
        public static int Estimate(ClusteringOptions opts)
        {
            // Fixed seed: all shuffles below are driven by this single generator.
            Random rnd = new Random(0xface);

            // Do a learning call and just ignore the result.
            // To warm-up PROSE. The first learning call always takes longer for some reason.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            Console.Write($"\n[+] Accuracy of recovering N ϵ [{opts.MinClusters},{opts.MaxClusters}] clusters NMI with (θ={opts.Theta},μ={opts.Mu}) @ {opts.NumStringsPerCluster} strings x {opts.TrialsPerClustering} trials ...");
            var separator = new string('=', 80);

            using (var file = File.CreateText(Path.Combine(Utils.Paths.LogsDir, $"NMI-{opts.Mu}x{opts.Theta}.log"))) {
                for (int clusters = opts.MinClusters; clusters <= opts.MaxClusters; ++clusters)
                {
                    Console.Write($"\nN = {clusters}:");
                    file.WriteLine($"\n\nN = {clusters} ... ");

                    // Running totals over all trials for this cluster count.
                    double total_nmi     = 0;
                    double total_time_ms = 0;

                    for (int i = 1; i <= opts.TrialsPerClustering; i++)
                    {
                        file.WriteLine($"\n\n{separator}");
                        file.Flush();

                        // Pick `clusters` random datasets and a random sample of distinct strings from each.
                        List <string> columns = Utils.Paths.CleanDatasets.Where(p => !p.EndsWith("aminos_cleaned.json"))
                                                .OrderBy(s => rnd.Next()).Take(clusters).ToList();
                        List <List <string> > data = columns.Select(f => TestCase.LoadNonEmptyData(f).Distinct().OrderBy(s => rnd.Next())
                                                                    .Take(opts.NumStringsPerCluster).ToList()).ToList();

                        file.WriteLine("Data:");
                        for (int j = data.Count - 1; j >= 0; j--)
                        {
                            file.WriteLine($"  [#] {columns[j]} = {string.Join("  .-.  ", data[j])}");
                        }
                        // Flush before learning so the inputs are on disk even if learning fails.
                        file.Flush();

                        // Only the learning call is timed; the constraints are shuffled so the
                        // original grouping is not visible from their order.
                        var watch   = Stopwatch.StartNew();
                        var program = Learner.Instance.Learn(data.SelectMany(d => d.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false)))
                                                             .OrderBy(c => rnd.Next())
                                                             .Append(new DisjunctionsLimit <string, bool>((uint)clusters, (uint)clusters))
                                                             .Append(new AllowedTokens <string, bool>(Utils.Default.Atoms))
                                                             .Append(new ClusteringParameters <string, bool>(opts.Mu, opts.Theta)));
                        watch.Stop();
                        Synthesizer.Engine.ClearLearningCache();

                        file.WriteLine($"\nProfile:\n  [$] {string.Join("\n  [$] ", program.Description())}");

                        // Regroup all strings by the tokens the learned profile matches them with.
                        List <List <string> > clustered_data = data.SelectMany(d => d).GroupBy(s => program.GetMatchingTokens(s))
                                                               .Select(g => g.ToList()).ToList();
                        double nmi = NormalizedMutualInfo(data, clustered_data);
                        total_nmi += nmi;
                        double time = watch.ElapsedMilliseconds;
                        total_time_ms += time;

                        file.WriteLine("\nClusters");
                        foreach (var d in clustered_data)
                        {
                            file.WriteLine($"  [=]  {string.Join("  .-.  ", d)}");
                        }

                        file.WriteLine($"\n{nmi,4:F2} @ {time,5}ms");
                        Console.Write($"   {nmi,4:F2} ({Math.Round(time/1000.0, 0),3}s)");
                    }

                    file.WriteLine($"\n\nSum(Time) = {total_time_ms}ms");
                    // The average is in milliseconds, like the sum above (previously mislabelled as seconds).
                    file.WriteLine($"Avg(Time) = {total_time_ms / opts.TrialsPerClustering:F2}ms");
                    file.WriteLine($"Avg(NMI) = {total_nmi / opts.TrialsPerClustering,4:F2}");
                    file.Flush();
                }
            }

            return 0;
        }
예제 #5
0
 /// <summary>
 /// Converts both strings to states and scores them with <c>ScoreStates</c>.
 /// </summary>
 /// <param name="sim">Ground-truth flag passed through to <c>ScoreStates</c>.</param>
 /// <param name="A">First string.</param>
 /// <param name="B">Second string.</param>
 /// <returns>The record produced by <c>ScoreStates</c> for the two converted states.</returns>
 public static SimData ComputeScoreForStrings(bool sim, string A, string B)
 {
     var stateA = Synthesizer.StringToState(A);
     var stateB = Synthesizer.StringToState(B);

     return ScoreStates(sim, stateA, stateB);
 }
예제 #6
0
        /// <summary>
        /// Evaluates learned profiles on the domain datasets. For each dataset it learns a profile
        /// from a random <c>opts.ProfileFraction</c> of the strings, then counts how many held-out
        /// strings of the same dataset the profile accepts (match) and how many strings sampled
        /// from the other datasets it accepts (mismatch).
        /// Per-dataset details are appended to a "Quality.FlashProfile.*.log" file, the data splits
        /// are written under "logs/datasets.{ProfileFraction}", and averaged precision, recall and
        /// F1 are printed at the end.
        /// </summary>
        /// <param name="opts">Supplies <c>Mu</c>, <c>Theta</c> and <c>ProfileFraction</c>.</param>
        /// <returns>Always 0.</returns>
        public static int Estimate(QualityOptions opts)
        {
            // Fixed seed: all shuffles below are driven by this single generator.
            Random rnd = new Random(0xf00d);

            // Do a learning call and just ignore the result.
            // To warm-up PROSE. The first learning call always takes longer for some reason.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            // Create the output directory first: this also creates the parent "logs" directory
            // that the log file below is written into. CreateDirectory is a no-op if it exists.
            var datasets_dir = Path.Combine(Utils.Paths.Root, "logs", $"datasets.{opts.ProfileFraction}");

            Directory.CreateDirectory(datasets_dir);

            var log_path = Path.Combine(Utils.Paths.Root, "logs", $"Quality.FlashProfile.{opts.Mu,4:F2}x{opts.Theta,4:F2}.{opts.ProfileFraction:F2}.log");

            // Start from an empty log.
            File.WriteAllText(log_path, "");

            Console.WriteLine($"  {"FILENAME",50} => (+VE) - (-VE) =   Δ   | Avg Δ");
            Console.WriteLine(new string('-', 128));

            double final_result = 0, count = 0, precision = 0, recall = 0;

            for (int i = 0; i < Utils.Paths.DomainsDatasets.Length; ++i)
            {
                var file_path       = Utils.Paths.DomainsDatasets[i];
                var file_name       = Path.GetFileNameWithoutExtension(file_path);
                var short_file_path = file_path.Replace(Utils.Paths.Root, "");

                // Materialize the shuffle once. A deferred OrderBy with a random key reshuffles on
                // every enumeration, which would let held-out strings overlap the profiled ones
                // and make the files written below differ from the data actually evaluated.
                var inputs = TestCase.LoadNonEmptyData(file_path).OrderBy(s => rnd.Next()).ToList();

                Console.Write($"\r[+] ({i,2} / {Utils.Paths.DomainsDatasets.Length,2}) {short_file_path} ... ");

                int profile_data_size = Convert.ToInt32(inputs.Count * opts.ProfileFraction);
                if (profile_data_size < 8)
                {
                    Console.WriteLine($"\r> {short_file_path,50} => ignore: dataset too small");
                    continue;
                }

                // The first part of the shuffled data is profiled; the remainder is held out.
                var profile_data = inputs.Take(profile_data_size).ToList();
                var constraints  = profile_data.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false))
                                   .Append(new AllowedTokens <string, bool>(Utils.Default.Atoms))
                                   .Append(new ClusteringParameters <string, bool>(opts.Mu, opts.Theta));
                var program = Learner.Instance.Learn(constraints);

                var held_out = inputs.Skip(profile_data_size).ToList();

                // Take an equal share from every other dataset so that both test sets are of
                // comparable size.
                int should_mismatch_data_size = Math.Max(1, held_out.Count
                                                         / (Utils.Paths.DomainsDatasets.Length - 1));
                var should_mismatch_data
                    = Utils.Paths.DomainsDatasets.Where(s => s != file_path)
                      .SelectMany(src => TestCase.LoadNonEmptyData(src).OrderBy(s => rnd.Next())
                                  .Take(should_mismatch_data_size)).ToList();
                var should_match_data = held_out.Take(should_mismatch_data.Count).ToList();

                // A missing program accepts nothing.
                double mismatch = should_mismatch_data.Count(s => program?.Run(s) ?? false);
                double match    = should_match_data.Count(s => program?.Run(s) ?? false);

                // When nothing at all was accepted, 0/0 would yield NaN and poison the running
                // average; count such a dataset as precision 0 instead.
                double accepted = match + mismatch;
                precision += accepted > 0 ? match / accepted : 0;
                recall    += match / should_mismatch_data.Count;

                match    /= should_mismatch_data.Count;
                mismatch /= should_mismatch_data.Count;
                double result = match - mismatch;
                final_result += result;
                Console.WriteLine($"\r> {short_file_path,50} => {match:F3} - {mismatch:F3} = {result:F3} | {final_result / ++count:F3}");

                // Since `file_name` may be the same for two files within test/*, we prepend i.
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.profiled"), string.Join("\n", profile_data));
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.match"), string.Join("\n", should_match_data));
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.mismatch"), string.Join("\n", should_mismatch_data));

                File.AppendAllText(log_path, $"> {short_file_path}\n\n");
                File.AppendAllText(log_path, $"  * Profiled Subset of Data:\n    {string.Join("\n    ", profile_data)}\n\n");
                File.AppendAllText(log_path, $"  * Default Profile:\n    {string.Join("\n    ", program?.Description() ?? Enumerable.Empty<string>())}\n\n");
                File.AppendAllText(log_path, $"  * Result:\n    + Match = {match}\n    + Mismatch = {mismatch}\n    + Score = {result}\n\n- - - - - - - -\n\n");
            }

            precision /= count;
            recall    /= count;

            // F1 is 0 when both precision and recall are 0 (the formula alone would give 0/0).
            double f1 = (precision + recall) == 0 ? 0 : 2.0 * precision * recall / (precision + recall);

            File.AppendAllText(log_path, $"> Summary:");
            File.AppendAllText(log_path, $"\n  * Average Score = {final_result / count}");
            File.AppendAllText(log_path, $"\n  * Precision = {precision}");
            File.AppendAllText(log_path, $"\n  * Recall = {recall}");
            File.AppendAllText(log_path, $"\n  * F1 = {f1}");

            Console.WriteLine($"\n> Processed = {count}   :   Precision = {precision}  |  Recall = {recall}  |  F1 = {f1}");
            return 0;
        }