Exemplo n.º 1
0
        /// <summary>
        /// Samples positive (same-dataset) and negative (cross-dataset) string pairs
        /// from the clean datasets, scores each pair with
        /// <see cref="ComputeScoreForStrings"/>, and writes one result per line to
        /// <c>Similarity.FlashProfile.log</c>.
        /// </summary>
        /// <param name="opts">Provides <c>SimCount</c>/<c>DisCount</c> sampling limits.</param>
        /// <returns>Always 0 (process exit code).</returns>
        public static int Estimate(SimilarityOptions opts)
        {
            // Fixed seed so pair sampling is reproducible across runs.
            Random rnd = new Random(0xf00d);

            // Do a learning call and just ignore the result.
            // To warm up PROSE. The first learning call always takes longer for some reason.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            var log_path = Path.Combine(Utils.Paths.LogsDir, "Similarity.FlashProfile.log");

            int sim_total = 0, dis_total = 0;

            // One StreamWriter for the whole run instead of File.AppendAllText per result:
            // the original reopened the log file once per line. `append: false` truncates,
            // matching the original's initial File.WriteAllText(log_path, "").
            using (var log = new StreamWriter(log_path, append: false))
            {
                for (int i = 0; i < Utils.Paths.CleanDatasets.Length; ++i)
                {
                    Console.Write($"\r[+] Saving to {log_path}: [{sim_total,8} +ve, {dis_total,8} -ve] ... {(100.0 * i) / Utils.Paths.CleanDatasets.Length,5:F2} %");

                    var           file_1 = Utils.Paths.CleanDatasets[i];
                    List <string> inputs = TestCase.LoadNonEmptyData(file_1);

                    // Positive pairs: distinct strings drawn from the same dataset,
                    // in random order, capped at SimCount.
                    var sim_results = (from s1 in inputs.OrderBy(s => rnd.Next())
                                       from s2 in inputs.OrderBy(s => rnd.Next())
                                       where s1 != s2
                                       select ComputeScoreForStrings(true, s1, s2)
                                       ).Take(opts.SimCount);
                    foreach (var res in sim_results)
                    {
                        // `\n` (not WriteLine) to keep the original file format byte-identical.
                        log.Write($"{res}\n");
                        sim_total++;
                    }

                    // Negative pairs: strings from this dataset vs. samples from every
                    // other dataset, DisCount per side.
                    var dis_results = Utils.Paths.CleanDatasets.Where(file_2 => file_2 != file_1)
                                      .Select(file_2 => TestCase.LoadNonEmptyData(file_2).OrderBy(s => rnd.Next()).Take(opts.DisCount))
                                      .SelectMany(disContent => (from s1 in inputs.OrderBy(s => rnd.Next()).Take(opts.DisCount)
                                                                 from s2 in disContent
                                                                 select ComputeScoreForStrings(false, s1, s2)));
                    foreach (var res in dis_results)
                    {
                        log.Write($"{res}\n");
                        dis_total++;
                    }
                }
            }
            Console.WriteLine($"\r[+] Saving to {log_path}: [{sim_total,8} +ve, {dis_total,8} -ve] ... 100 %");

            return(0);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Measures clustering accuracy: for each target cluster count N in
        /// [MinClusters, MaxClusters], repeatedly samples N dataset columns, learns a
        /// profile constrained to exactly N disjunctions, and reports the Normalized
        /// Mutual Information between true and recovered clusters (plus timing) to an
        /// <c>NMI-{Mu}x{Theta}.log</c> file.
        /// </summary>
        /// <param name="opts">Cluster range, trial count, per-cluster sample size, and (μ, θ).</param>
        /// <returns>Always 0 (process exit code).</returns>
        public static int Estimate(ClusteringOptions opts)
        {
            // Fixed seed so column/string sampling is reproducible across runs.
            Random rnd = new Random(0xface);

            // Do a learning call and just ignore the result.
            // To warm-up PROSE. The first learning call always takes longer for some reason.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            Console.Write($"\n[+] Accuracy of recovering N ϵ [{opts.MinClusters},{opts.MaxClusters}] clusters NMI with (θ={opts.Theta},μ={opts.Mu}) @ {opts.NumStringsPerCluster} strings x {opts.TrialsPerClustering} trials ...");
            var separator = new string('=', 80);

            using (var file = File.CreateText(Path.Combine(Utils.Paths.LogsDir, $"NMI-{opts.Mu}x{opts.Theta}.log"))) {
                for (int clusters = opts.MinClusters; clusters <= opts.MaxClusters; ++clusters)
                {
                    Console.Write($"\nN = {clusters}:");
                    file.WriteLine($"\n\nN = {clusters} ... ");

                    // Accumulators for NMI and wall-clock time across trials.
                    var stats = new Dictionary<ahc_info, double> {
                        [ahc_info.NMI]  = 0,
                        [ahc_info.TIME] = 0
                    };

                    for (int i = 1; i <= opts.TrialsPerClustering; i++)
                    {
                        file.WriteLine($"\n\n{separator}");
                        file.Flush();

                        // Ground truth: `clusters` random columns (aminos excluded), each
                        // contributing up to NumStringsPerCluster distinct strings.
                        List<string> columns = Utils.Paths.CleanDatasets.Where(p => !p.EndsWith("aminos_cleaned.json"))
                                               .OrderBy(s => rnd.Next()).Take(clusters).ToList();
                        List<List<string>> data = columns.Select(f => TestCase.LoadNonEmptyData(f).Distinct().OrderBy(s => rnd.Next())
                                                                      .Take(opts.NumStringsPerCluster).ToList()).ToList();

                        file.WriteLine("Data:");
                        for (int j = data.Count - 1; j >= 0; j--)
                        {
                            file.WriteLine($"  [#] {columns[j]} = {string.Join("  .-.  ", data[j])}");
                        }
                        file.Flush();

                        // Time only the learning call; constraints pin the profile to
                        // exactly `clusters` disjunctions over the default atoms.
                        var watch = Stopwatch.StartNew();
                        var program = Learner.Instance.Learn(data.SelectMany(d => d.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false)))
                                                             .OrderBy(c => rnd.Next())
                                                             .Append(new DisjunctionsLimit <string, bool>((uint)clusters, (uint)clusters))
                                                             .Append(new AllowedTokens <string, bool>(Utils.Default.Atoms))
                                                             .Append(new ClusteringParameters <string, bool>(opts.Mu, opts.Theta)));
                        watch.Stop();
                        Synthesizer.Engine.ClearLearningCache();

                        file.WriteLine($"\nProfile:\n  [$] {string.Join("\n  [$] ", program.Description())}");
                        // Recovered clusters: strings grouped by which profile tokens match them.
                        List<List<string>> clustered_data = data.SelectMany(d => d).GroupBy(s => program.GetMatchingTokens(s))
                                                                .Select(g => g.ToList()).ToList();
                        double nmi = NormalizedMutualInfo(data, clustered_data);
                        stats[ahc_info.NMI] += nmi;
                        double time = watch.ElapsedMilliseconds;
                        stats[ahc_info.TIME] += time;

                        file.WriteLine("\nClusters");
                        foreach (var d in clustered_data)
                        {
                            file.WriteLine($"  [=]  {string.Join("  .-.  ", d)}");
                        }

                        file.WriteLine($"\n{nmi,4:F2} @ {time,5}ms");
                        Console.Write($"   {nmi,4:F2} ({Math.Round(time/1000.0, 0),3}s)");
                    }

                    file.WriteLine($"\n\nSum(Time) = {stats[ahc_info.TIME]}ms");
                    // BUG FIX: the accumulated time is in milliseconds (ElapsedMilliseconds),
                    // but this line was labeled "s"; the unit now matches Sum(Time) above.
                    file.WriteLine($"Avg(Time) = {stats[ahc_info.TIME] / opts.TrialsPerClustering:F2}ms");
                    file.WriteLine($"Avg(NMI) = {stats[ahc_info.NMI] / opts.TrialsPerClustering,4:F2}");
                    file.Flush();
                }
            }

            return(0);
        }
Exemplo n.º 3
0
 /// <summary>
 /// Converts both strings to synthesizer states and delegates to
 /// <see cref="ScoreStates"/> to score the pair.
 /// </summary>
 /// <param name="sim">Whether the pair is expected to be similar (+ve) or not (-ve).</param>
 /// <param name="A">First string of the pair.</param>
 /// <param name="B">Second string of the pair.</param>
 public static SimData ComputeScoreForStrings(bool sim, string A, string B)
 {
     var stateA = Synthesizer.StringToState(A);
     var stateB = Synthesizer.StringToState(B);
     return ScoreStates(sim, stateA, stateB);
 }
Exemplo n.º 4
0
        /// <summary>
        /// Measures profile quality per domain dataset: learns a profile from a random
        /// fraction of each dataset, then counts how many held-out strings from the same
        /// dataset match (+ve) vs. strings from other datasets (-ve). Writes per-dataset
        /// scores and an overall precision/recall/F1 summary to a log file, and dumps the
        /// sampled subsets under <c>logs/datasets.{ProfileFraction}</c>.
        /// </summary>
        /// <param name="opts">Provides μ, θ, and the fraction of each dataset to profile.</param>
        /// <returns>Always 0 (process exit code).</returns>
        public static int Estimate(QualityOptions opts)
        {
            // Fixed seed so data sampling is reproducible across runs.
            Random rnd = new Random(0xf00d);

            // Do a learning call and just ignore the result.
            // To warm-up PROSE. The first learning call always takes longer for some reason.
            Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

            var log_path = Path.Combine(Utils.Paths.Root, "logs", $"Quality.FlashProfile.{opts.Mu,4:F2}x{opts.Theta,4:F2}.{opts.ProfileFraction:F2}.log");

            File.WriteAllText(log_path, "");

            var datasets_dir = Path.Combine(Utils.Paths.Root, "logs", $"datasets.{opts.ProfileFraction}");

            if (!Directory.Exists(datasets_dir))
            {
                Directory.CreateDirectory(datasets_dir);
            }

            Console.WriteLine($"  {"FILENAME",50} => (+VE) - (-VE) =   Δ   | Avg Δ");
            Console.WriteLine(new string('-', 128));

            double final_result = 0, count = 0, precision = 0, recall = 0;

            for (int i = 0; i < Utils.Paths.DomainsDatasets.Length; ++i)
            {
                var file_path       = Utils.Paths.DomainsDatasets[i];
                var file_name       = Path.GetFileNameWithoutExtension(file_path);
                var short_file_path = file_path.Replace(Utils.Paths.Root, "");

                // BUG FIX: materialize the shuffle once. This was a deferred
                // OrderBy(s => rnd.Next()) query, so every enumeration (Count(), Take(),
                // Skip()) reshuffled with fresh random keys — the profiled subset could
                // overlap the held-out match set, and the strings written to disk were not
                // the strings actually scored.
                var inputs = TestCase.LoadNonEmptyData(file_path).OrderBy(s => rnd.Next()).ToList();

                Console.Write($"\r[+] ({i,2} / {Utils.Paths.DomainsDatasets.Length,2}) {short_file_path} ... ");

                int profile_data_size = Convert.ToInt32(inputs.Count * opts.ProfileFraction);
                if (profile_data_size < 8)
                {
                    Console.WriteLine($"\r> {short_file_path,50} => ignore: dataset too small");
                    continue;
                }

                // Learn a profile from the first `profile_data_size` shuffled strings.
                var profile_data = inputs.Take(profile_data_size).ToList();
                var constraints  = profile_data.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false))
                                   .Append(new AllowedTokens <string, bool>(Utils.Default.Atoms))
                                   .Append(new ClusteringParameters <string, bool>(opts.Mu, opts.Theta));
                var program = Learner.Instance.Learn(constraints);

                // Held-out sets: the rest of this dataset (should match) vs. an equal-sized
                // sample spread across all other datasets (should NOT match).
                var should_match_data         = inputs.Skip(profile_data_size);
                int should_mismatch_data_size = Math.Max(1, should_match_data.Count()
                                                         / (Utils.Paths.DomainsDatasets.Length - 1));
                var should_mismatch_data
                    = Utils.Paths.DomainsDatasets.Where(s => s != file_path)
                      .SelectMany(src => TestCase.LoadNonEmptyData(src).OrderBy(s => rnd.Next())
                                  .Take(should_mismatch_data_size)).ToList();
                // Materialized so the counting below and the dump to disk see the same strings.
                should_match_data = should_match_data.Take(should_mismatch_data.Count).ToList();

                double mismatch = should_mismatch_data.Count(s => program?.Run(s) ?? false);
                double match    = should_match_data.Count(s => program?.Run(s) ?? false);

                precision += match / (match + mismatch);
                recall    += match / should_mismatch_data.Count;

                match    /= should_mismatch_data.Count;
                mismatch /= should_mismatch_data.Count;
                double result = match - mismatch;
                final_result += result;
                Console.WriteLine($"\r> {short_file_path,50} => {match:F3} - {mismatch:F3} = {result:F3} | {final_result / ++count:F3}");

                // Since `file_name` may be same for two files within test/*, we prepend i.
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.profiled"), string.Join("\n", profile_data));
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.match"), string.Join("\n", should_match_data));
                File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.mismatch"), string.Join("\n", should_mismatch_data));

                File.AppendAllText(log_path, $"> {short_file_path}\n\n");
                File.AppendAllText(log_path, $"  * Profiled Subset of Data:\n    {string.Join("\n    ", profile_data)}\n\n");
                File.AppendAllText(log_path, $"  * Default Profile:\n    {string.Join("\n    ", program?.Description() ?? Enumerable.Empty<string>())}\n\n");
                File.AppendAllText(log_path, $"  * Result:\n    + Match = {match}\n    + Mismatch = {mismatch}\n    + Score = {result}\n\n- - - - - - - -\n\n");
            }

            // NOTE(review): if every dataset is skipped as "too small", count is 0 and
            // these become NaN; the original had the same behavior.
            precision /= count;
            recall    /= count;
            double f1 = 2.0 * precision * recall / (precision + recall);

            File.AppendAllText(log_path, $"> Summary:");
            File.AppendAllText(log_path, $"\n  * Average Score = {final_result / count}");
            File.AppendAllText(log_path, $"\n  * Precision = {precision}");
            File.AppendAllText(log_path, $"\n  * Recall = {recall}");
            File.AppendAllText(log_path, $"\n  * F1 = {f1}");

            Console.WriteLine($"\n> Processed = {count}   :   Precision = {precision}  |  Recall = {recall}  |  F1 = {f1}");
            return(0);
        }