public static int ComputeEta(EtaOptions opts)
{
    var states = opts.Strings.Select(ParseNullEmpty).Select(Synthesizer.StringToState).ToArray();

    // All unordered pairs of states.
    var pairs = states.Select((s, i) => new { s, i })
                      .SelectMany(x => states.Skip(x.i + 1), (x, y) => Tuple.Create(x.s, y));

    var dissims = new List<double>();
    foreach (var pair in pairs)
    {
        var res = Similarity.ScoreStates(true, pair.Item1, pair.Item2);
        Console.WriteLine($"> {{ '{res.A}' , '{res.B}' }} => {res.Pattern}");
        Console.WriteLine($"> Pairwise Dissimilarity = {res.Cost}\n");
        dissims.Add(res.Cost);
    }

    if (opts.Strings.Count() > 2)
    {
        Console.WriteLine($"> Avg. Pairwise Dissimilarity = {dissims.Sum() / dissims.Count,12:F5}");
        var prog = Synthesizer.Learn(1, states);
        Console.WriteLine($"> Best Overall Pattern = {prog.Description()}");
    }

    var vsa = Synthesizer.LearnAll(states);
    Console.WriteLine($"> Total Number of Consistent Patterns = {vsa.Size}.");
    if (opts.NumCandidates < 1) return 0;

    int sample_size = vsa.Size > opts.NumCandidates ? opts.NumCandidates : (int)vsa.Size;
    Console.WriteLine($"\n> {sample_size} Randomly Selected Patterns:");
    var rnd = new Random();
    // Deduplicate the sampled programs by description; the SortedSet also keeps
    // the output in a stable order.
    var sampledProgs = new SortedSet<ProgramNode>(
        Comparer<ProgramNode>.Create((p1, p2) => p1.Description().CompareTo(p2.Description())));
    while (sampledProgs.Count < sample_size)
        sampledProgs.Add(vsa.Sample(rnd));
    foreach (var p in sampledProgs)
        Console.WriteLine($" * {-p.GetFeatureValue(MText.Learner.Instance.ScoreFeature),12:F5} : {p.Description()}");
    return 0;
}
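// A minimal usage sketch for ComputeEta. The option values below are
// hypothetical, and this assumes EtaOptions exposes settable Strings and
// NumCandidates properties matching the ones read above; it is an
// illustration, not part of the original tool.
public static void ComputeEtaExample()
{
    ComputeEta(new EtaOptions
    {
        Strings = new[] { "2017-01-01", "2017-02-15", "March 3, 2017" },
        NumCandidates = 4
    });
}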
public static int Estimate(SimilarityOptions opts)
{
    Random rnd = new Random(0xf00d);

    // Do a learning call and ignore the result, just to warm up PROSE;
    // the first learning call always takes longer for some reason.
    Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

    var log_path = Path.Combine(Utils.Paths.LogsDir, "Similarity.FlashProfile.log");
    File.WriteAllText(log_path, "");

    int sim_total = 0, dis_total = 0;
    for (int i = 0; i < Utils.Paths.CleanDatasets.Length; ++i)
    {
        Console.Write($"\r[+] Saving to {log_path}: [{sim_total,8} +ve, {dis_total,8} -ve] ... {(100.0 * i) / Utils.Paths.CleanDatasets.Length,5:F2} %");
        var file_1 = Utils.Paths.CleanDatasets[i];
        List<string> inputs = TestCase.LoadNonEmptyData(file_1);

        // Positive (similar) pairs: distinct strings drawn from the same dataset.
        var results = (from s1 in inputs.OrderBy(s => rnd.Next())
                       from s2 in inputs.OrderBy(s => rnd.Next())
                       where s1 != s2
                       select ComputeScoreForStrings(true, s1, s2)).Take(opts.SimCount);
        foreach (var res in results)
        {
            File.AppendAllText(log_path, $"{res}\n");
            sim_total++;
        }

        // Negative (dissimilar) pairs: strings drawn from two different datasets.
        results = Utils.Paths.CleanDatasets.Where(file_2 => file_2 != file_1)
            .Select(file_2 => TestCase.LoadNonEmptyData(file_2).OrderBy(s => rnd.Next()).Take(opts.DisCount))
            .SelectMany(disContent => from s1 in inputs.OrderBy(s => rnd.Next()).Take(opts.DisCount)
                                      from s2 in disContent
                                      select ComputeScoreForStrings(false, s1, s2));
        foreach (var res in results)
        {
            File.AppendAllText(log_path, $"{res}\n");
            dis_total++;
        }
    }
    Console.WriteLine($"\r[+] Saving to {log_path}: [{sim_total,8} +ve, {dis_total,8} -ve] ... 100 %");
    return 0;
}
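// The positive-pair sampling above follows a common LINQ pattern: shuffle
// with OrderBy(rnd.Next()), cross-join, drop identical pairs, then Take a
// budget. A self-contained sketch of that pattern in isolation (a
// hypothetical helper, not part of the original code):
private static IEnumerable<(string, string)> SampleDistinctPairs(List<string> inputs, int count, Random rnd) =>
    (from s1 in inputs.OrderBy(s => rnd.Next())
     from s2 in inputs.OrderBy(s => rnd.Next())
     where s1 != s2
     select (s1, s2)).Take(count);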
public static SimData ScoreStates(bool ground_truth, State A, State B)
{
    Stopwatch s = new Stopwatch();
    s.Start();
    ProgramNode program = Synthesizer.Learn(1, A, B);
    s.Stop();
    Synthesizer.Engine.ClearLearningCache();

    // If learning fails, fall back to the score of the most general token (Any).
    return new SimData
    {
        A = Synthesizer.StateToString(A),
        B = Synthesizer.StateToString(B),
        GroundTruth = ground_truth,
        SynthesisTime = Convert.ToUInt32(s.ElapsedMilliseconds),
        Pattern = program == null
            ? "<NULL>"
            : program.AcceptVisitor(new TokensCollector()).CombinedDescription(),
        Cost = Convert.ToSingle(program == null
            ? -DefaultTokens.Any.Score
            : -program.GetFeatureValue(Learner.Instance.ScoreFeature)),
        Score = Convert.ToSingle(1.0 / Witnesses.ScoreTransform(program == null
            ? -DefaultTokens.Any.Score
            : -program.GetFeatureValue(Learner.Instance.ScoreFeature)))
    };
}
public static int Estimate(ClusteringOptions opts)
{
    Random rnd = new Random(0xface);

    // Do a learning call and ignore the result, just to warm up PROSE;
    // the first learning call always takes longer for some reason.
    Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

    Stopwatch watch;
    Console.Write($"\n[+] Accuracy (NMI) of recovering N ϵ [{opts.MinClusters},{opts.MaxClusters}] clusters with (θ={opts.Theta},μ={opts.Mu}) @ {opts.NumStringsPerCluster} strings x {opts.TrialsPerClustering} trials ...");
    var separator = new string('=', 80);

    using (var file = File.CreateText(Path.Combine(Utils.Paths.LogsDir, $"NMI-{opts.Mu}x{opts.Theta}.log")))
    {
        for (int clusters = opts.MinClusters; clusters <= opts.MaxClusters; ++clusters)
        {
            Console.Write($"\nN = {clusters}:");
            file.WriteLine($"\n\nN = {clusters} ... ");
            var stats = new Dictionary<ahc_info, double> { [ahc_info.NMI] = 0, [ahc_info.TIME] = 0 };

            for (int i = 1; i <= opts.TrialsPerClustering; i++)
            {
                file.WriteLine($"\n\n{separator}");
                file.Flush();

                // Pick `clusters` random datasets, and a random sample of strings from each.
                List<string> columns = Utils.Paths.CleanDatasets.Where(p => !p.EndsWith("aminos_cleaned.json"))
                                            .OrderBy(s => rnd.Next()).Take(clusters).ToList();
                List<List<string>> data = columns.Select(f => TestCase.LoadNonEmptyData(f).Distinct()
                                                                      .OrderBy(s => rnd.Next())
                                                                      .Take(opts.NumStringsPerCluster).ToList()).ToList();
                file.WriteLine("Data:");
                for (int j = data.Count - 1; j >= 0; j--)
                    file.WriteLine($" [#] {columns[j]} = {string.Join(" .-. ", data[j])}");
                file.Flush();

                watch = Stopwatch.StartNew();
                var program = Learner.Instance.Learn(
                    data.SelectMany(d => d.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false)))
                        .OrderBy(c => rnd.Next())
                        .Append(new DisjunctionsLimit<string, bool>((uint)clusters, (uint)clusters))
                        .Append(new AllowedTokens<string, bool>(Utils.Default.Atoms))
                        .Append(new ClusteringParameters<string, bool>(opts.Mu, opts.Theta)));
                watch.Stop();
                Synthesizer.Engine.ClearLearningCache();

                file.WriteLine($"\nProfile:\n [$] {string.Join("\n [$] ", program.Description())}");

                // Group the strings by the pattern that matches them, and compare
                // this clustering against the ground-truth dataset partition.
                List<List<string>> clustered_data = data.SelectMany(d => d)
                                                        .GroupBy(s => program.GetMatchingTokens(s))
                                                        .Select(g => g.ToList()).ToList();
                double nmi = NormalizedMutualInfo(data, clustered_data);
                stats[ahc_info.NMI] += nmi;
                double time = watch.ElapsedMilliseconds;
                stats[ahc_info.TIME] += time;

                file.WriteLine("\nClusters");
                foreach (var d in clustered_data)
                    file.WriteLine($" [=] {string.Join(" .-. ", d)}");
                file.WriteLine($"\n{nmi,4:F2} @ {time,5}ms");
                Console.Write($" {nmi,4:F2} ({Math.Round(time / 1000.0, 0),3}s)");
            }

            file.WriteLine($"\n\nSum(Time) = {stats[ahc_info.TIME]}ms");
            file.WriteLine($"Avg(Time) = {stats[ahc_info.TIME] / opts.TrialsPerClustering:F2}ms");
            file.WriteLine($"Avg(NMI) = {stats[ahc_info.NMI] / opts.TrialsPerClustering,4:F2}");
            file.Flush();
        }
    }
    return 0;
}
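// NormalizedMutualInfo (used above) is defined elsewhere. The sketch below
// shows one common formulation, NMI = 2*I(X;Y) / (H(X) + H(Y)), computed over
// two partitions of the same set of strings. It illustrates the metric, and
// is not necessarily the exact implementation used by this tool.
private static double NmiSketch(List<List<string>> truth, List<List<string>> predicted)
{
    double n = truth.Sum(c => c.Count);
    // Entropy of a partition, from its cluster-size distribution.
    double H(IEnumerable<List<string>> partition) =>
        -partition.Sum(c => c.Count / n * Math.Log(c.Count / n));
    // Mutual information from the overlap of each (truth, predicted) cluster pair.
    double mi = 0;
    foreach (var t in truth)
        foreach (var p in predicted)
        {
            double overlap = t.Intersect(p).Count();
            if (overlap > 0)
                mi += overlap / n * Math.Log(n * overlap / (t.Count * (double)p.Count));
        }
    return 2 * mi / (H(truth) + H(predicted));
}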
public static SimData ComputeScoreForStrings(bool sim, string A, string B)
    => ScoreStates(sim, Synthesizer.StringToState(A), Synthesizer.StringToState(B));
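// Usage sketch: score two (hypothetical) sample strings and print the fields
// populated by ScoreStates above.
public static void ScoreExample()
{
    SimData res = ComputeScoreForStrings(true, "2017-01-01", "2017-02-15");
    Console.WriteLine($"{res.Pattern}: cost = {res.Cost:F5}, score = {res.Score:F5}");
}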
public static int Estimate(QualityOptions opts)
{
    Random rnd = new Random(0xf00d);

    // Do a learning call and ignore the result, just to warm up PROSE;
    // the first learning call always takes longer for some reason.
    Synthesizer.Learn(1, Synthesizer.StringToState(">)#*$&"), Synthesizer.StringToState("969dvb"));

    var log_path = Path.Combine(Utils.Paths.Root, "logs", $"Quality.FlashProfile.{opts.Mu,4:F2}x{opts.Theta,4:F2}.{opts.ProfileFraction:F2}.log");
    File.WriteAllText(log_path, "");
    var datasets_dir = Path.Combine(Utils.Paths.Root, "logs", $"datasets.{opts.ProfileFraction}");
    if (!Directory.Exists(datasets_dir)) Directory.CreateDirectory(datasets_dir);

    Console.WriteLine($" {"FILENAME",50} => (+VE) - (-VE) = Δ | Avg Δ");
    Console.WriteLine(new string('-', 128));

    double final_result = 0, count = 0, precision = 0, recall = 0;
    for (int i = 0; i < Utils.Paths.DomainsDatasets.Length; ++i)
    {
        var file_path = Utils.Paths.DomainsDatasets[i];
        var file_name = Path.GetFileNameWithoutExtension(file_path);
        var short_file_path = file_path.Replace(Utils.Paths.Root, "");
        var inputs = TestCase.LoadNonEmptyData(file_path).OrderBy(s => rnd.Next());
        Console.Write($"\r[+] ({i,2} / {Utils.Paths.DomainsDatasets.Length,2}) {short_file_path} ... ");

        int profile_data_size = Convert.ToInt32(inputs.Count() * opts.ProfileFraction);
        if (profile_data_size < 8)
        {
            Console.WriteLine($"\r> {short_file_path,50} => ignore: dataset too small");
            continue;
        }

        // Learn a profile from a fraction of the dataset.
        var profile_data = inputs.Take(profile_data_size).ToList();
        var constraints = profile_data.Select(s => Learner.Instance.BuildPositiveConstraint(s, true, false))
                                      .Append(new AllowedTokens<string, bool>(Utils.Default.Atoms))
                                      .Append(new ClusteringParameters<string, bool>(opts.Mu, opts.Theta));
        var program = Learner.Instance.Learn(constraints);

        // Test the profile on held-out strings from this dataset (+ve), and on
        // equally many strings sampled from the other datasets (-ve).
        var should_match_data = inputs.Skip(profile_data_size);
        int should_mismatch_data_size = Math.Max(1, should_match_data.Count() / (Utils.Paths.DomainsDatasets.Length - 1));
        var should_mismatch_data = Utils.Paths.DomainsDatasets.Where(s => s != file_path)
            .SelectMany(src => TestCase.LoadNonEmptyData(src).OrderBy(s => rnd.Next())
                                       .Take(should_mismatch_data_size)).ToList();
        should_match_data = should_match_data.Take(should_mismatch_data.Count);

        double mismatch = should_mismatch_data.Count(s => program?.Run(s) ?? false);
        double match = should_match_data.Count(s => program?.Run(s) ?? false);
        precision += match / (match + mismatch);
        // Both test sets hold should_mismatch_data.Count strings after the Take
        // above, so this denominator equals |should_match_data|.
        recall += match / should_mismatch_data.Count;
        match /= should_mismatch_data.Count;
        mismatch /= should_mismatch_data.Count;
        double result = match - mismatch;
        final_result += result;
        Console.WriteLine($"\r> {short_file_path,50} => {match:F3} - {mismatch:F3} = {result:F3} | {final_result / ++count:F3}");

        // Since `file_name` may be the same for two files within test/*, we prepend i.
        File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.profiled"), string.Join("\n", profile_data));
        File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.match"), string.Join("\n", should_match_data));
        File.WriteAllText(Path.Combine(datasets_dir, $"{i}_{file_name}.mismatch"), string.Join("\n", should_mismatch_data));

        File.AppendAllText(log_path, $"> {short_file_path}\n\n");
        File.AppendAllText(log_path, $" * Profiled Subset of Data:\n {string.Join("\n ", profile_data)}\n\n");
        File.AppendAllText(log_path, $" * Default Profile:\n {string.Join("\n ", program?.Description() ?? Enumerable.Empty<string>())}\n\n");
        File.AppendAllText(log_path, $" * Result:\n + Match = {match}\n + Mismatch = {mismatch}\n + Score = {result}\n\n- - - - - - - -\n\n");
    }

    precision /= count;
    recall /= count;
    double f1 = 2.0 * precision * recall / (precision + recall);
    File.AppendAllText(log_path, $"> Summary:");
    File.AppendAllText(log_path, $"\n * Average Score = {final_result / count}");
    File.AppendAllText(log_path, $"\n * Precision = {precision}");
    File.AppendAllText(log_path, $"\n * Recall = {recall}");
    File.AppendAllText(log_path, $"\n * F1 = {f1}");
    Console.WriteLine($"\n> Processed = {count} : Precision = {precision} | Recall = {recall} | F1 = {f1}");
    return 0;
}
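// F1 above is the harmonic mean of the macro-averaged precision and recall:
// F1 = 2PR / (P + R). A hypothetical helper (not part of the original code)
// that also guards against the P = R = 0 case, which would otherwise yield NaN:
private static double F1Score(double precision, double recall) =>
    precision + recall == 0 ? 0 : 2.0 * precision * recall / (precision + recall);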