/// <summary>
/// Type initializer: loads the overall result set from the file named by the
/// "OverAllKnnGraphBin" app setting and derives the static lookup structures from it.
/// </summary>
static OverallDataAccessor()
{
    var knnGraphPath = ConfigurationManager.AppSettings["OverAllKnnGraphBin"];
    allClusters = AllResults.Load(knnGraphPath);

    // Reverse() is a helper defined outside this view; presumably it inverts the
    // name -> id encoding into id -> name (verify against its definition).
    allImages = allClusters.ImageEncoding.Reverse();
    allPatches = allClusters.PatchEncoding.Reverse();

    // Index rows by the (image id, patch id) pair of their query patch.
    // When several rows share the same query, the first one in file order is kept.
    distanceLookup = allClusters.Rows
        .GroupBy(row => Tuple.Create<int, int>(row.Query.ImageId, row.Query.PatchId))
        .ToDictionary(sameQuery => sameQuery.Key, sameQuery => sameQuery.First());
}
/// <summary>
/// Loads the results stored at <paramref name="path"/>, groups them into clusters and
/// writes the outcome next to the input: a "-patchClusters.bin" file and a
/// "-patchLevelClustersReduced.html" rendering.
/// </summary>
/// <param name="path">Path of the source .bin file; output names are derived by replacing ".bin".</param>
private static void GraphComponentDecomposition(string path)
{
    var source = AllResults.Load(path);
    var clustered = GroupIntoClusters(source);

    var clustersBinPath = path.Replace(".bin", "-patchClusters.bin");
    clustered.Save(clustersBinPath);

    var clustersHtmlPath = path.Replace(".bin", "-patchLevelClustersReduced.html");
    using (var htmlWriter = new StreamWriter(clustersHtmlPath, append: false))
    {
        clustered.Render(htmlWriter);
    }
}
/// <summary>
/// Processes every "*conv4*.bin" file in the hard-coded protobuf folder:
/// loads it, passes it through Filter and hands the resulting name to Print,
/// reporting progress on the console.
/// </summary>
private static void RenderData()
{
    const string protobufFolder = @"G:\siret\zoot\protobuf";

    // GetFiles takes a snapshot of the directory, so files created while the
    // loop runs are not picked up by this pass.
    string[] conv4Bins = Directory.GetFiles(protobufFolder, "*conv4*.bin");

    foreach (var binPath in conv4Bins)
    {
        Console.WriteLine($"{binPath} started");

        var results = AllResults.Load(binPath);
        var filteredName = Filter(results, binPath);
        Print(filteredName, results);

        Console.WriteLine($"{binPath} is done now");
    }
}
/// <summary>
/// Parses the Spark rule output ("part*" files in a hard-coded folder) and prints the rules
/// whose input has two or three items to an HTML file, using the image encoding of the
/// conv5 protobuf file to translate ids back to names. Waits for Enter before returning.
/// </summary>
private static void SparkMasketBasketParsing()
{
    // NOTE(review): this prompt is printed, but no pattern is read from the console;
    // the input folder below is fixed.
    Console.WriteLine("Enter pattern of Spark .txt file(s)");

    // Lines are streamed lazily from all part files, one file after another.
    var partFiles = Directory.EnumerateFiles(@"G:\siret\spark-out\rules_conv_5-occur_5.0-conf_0.1", "part*");
    var ruleLines = partFiles.SelectMany(File.ReadLines);
    var parsedRules = SparkResults.Parse(ruleLines);

    var nameToId = AllResults.Load(@"G:\siret\zoot\protobuf\local-conv5-cleaned-shrinked.bin").ImageEncoding;

    using (var htmlWriter = new StreamWriter(@"G:\siret\spark-viz\market-basket-conv5-large-filtered.html"))
    {
        // Swap keys and values of the encoding so it can be looked up by its value.
        var idToName = nameToId.ToDictionary(pair => pair.Value, pair => pair.Key);

        parsedRules.Print(
            htmlWriter,
            idToName,
            rule => rule.Input.Length < 4 && rule.Input.Length > 1);
    }

    Console.WriteLine("Printed. Pres enter...");
    Console.ReadLine();
}
/// <summary>
/// Shrinks the overall result set to the rows whose query patch occurs (as a query or a hit)
/// in the filtered-patches file, and saves the remainder as "-essential-knn.bin" next to the
/// file configured under "OverAllKnnGraphBin".
/// </summary>
/// <remarks>
/// The rows are removed in place from <c>OverallDataAccessor.allClusters</c>, so that shared
/// static instance stays reduced after this method returns.
/// </remarks>
private static void CreateSmallerBinsRoutine()
{
    var filtered = AllResults.Load(ConfigurationManager.AppSettings["FilteredPatchesBin"]);
    var filteredImageById = filtered.ImageEncoding.Reverse();
    var filteredPatchById = filtered.PatchEncoding.Reverse();

    var overall = OverallDataAccessor.allClusters;

    // Every patch mentioned in the filtered file: all hits of each row plus the row's query.
    var mentionedPatches = filtered.Rows
        .SelectMany(row => row.Hits.Select(hit => hit.Hit).Concat(new[] { row.Query }))
        .Distinct();

    // Re-express each patch in the id space of the overall file by going through
    // the filtered file's reversed encodings and the overall file's encodings.
    var wantedQueries = mentionedPatches
        .Select(patch => new Patch
        {
            ImageId = overall.ImageEncoding[filteredImageById[patch.ImageId]],
            PatchId = overall.PatchEncoding[filteredPatchById[patch.PatchId]]
        })
        .ToLookup(patch => patch);

    var removed = overall.Rows.RemoveAll(row => !wantedQueries.Contains(row.Query));

    var essentialPath = ConfigurationManager.AppSettings["OverAllKnnGraphBin"].Replace(".bin", "-essential-knn.bin");
    overall.Save(essentialPath);

    Console.WriteLine($"After filtering {overall.Rows.Count} rows remaining, {removed} was removed");
}
/// <summary>
/// Entry point. With a recognised first argument ("run-offline-stats", "csv-to-bin",
/// "bin-all-smaller") it runs the matching routine and exits. Otherwise it labels every
/// cluster of the filtered-patches file with triples of shared attributes, ranks them by
/// absolute point-biserial correlation, and writes the labelled clusters as
/// "-with-labels-triples.bin" and "-with-labels-triples.html".
/// </summary>
/// <param name="args">Command-line arguments; only the first one is inspected.</param>
public static void Main(string[] args)
{
    var command = args.Length > 0 ? args[0] : null;
    switch (command)
    {
        case "run-offline-stats":
            StatsCalculationRoutine();
            return;
        case "csv-to-bin":
            CsvToProtobuf.CreateProtobufFile();
            return;
        case "bin-all-smaller":
            CreateSmallerBinsRoutine();
            return;
    }

    var zootLabels = ZootLabelProcessingTests.AllRecords;
    var zootLabelsByName = zootLabels.CreateIndex(x => new[] { x.id }).Unique();

    var filteredBinPath = ConfigurationManager.AppSettings["FilteredPatchesBin"];
    var filteredClusters = AllResults.Load(filteredBinPath);

    // Image id -> lower-cased file name without extension (the key used to index zootLabelsByName).
    var fromIdToName = filteredClusters.ImageEncoding.ToDictionary(
        pair => pair.Value,
        pair => Path.GetFileNameWithoutExtension(pair.Key).ToLower());
    var patchIdToname = filteredClusters.PatchEncoding.Reverse();
    var imageIdToFullName = filteredClusters.ImageEncoding.Reverse();

    // Debugging aid: uncomment to process only a handful of clusters.
    //filteredClusters.Rows = filteredClusters.Rows.Take(10).ToList();

    Parallel.ForEach(filteredClusters.Rows, cluster =>
    {
        Console.Write('.'); // progress: cluster started

        // All patches of the cluster: its hits plus the query itself.
        var involved = cluster.Hits.Select(hit => hit.Hit).ToList();
        involved.Add(cluster.Query);

        // Attach the label record and the hits found in the big file to each distinct patch.
        var withDistancesAndLabels = involved
            .Distinct()
            .Select(patch =>
            {
                var imageName = fromIdToName[patch.ImageId];
                var zoot = zootLabelsByName[imageName];
                var distances = OverallDataAccessor.FindHitsInBigFile(
                    imageIdToFullName[patch.ImageId],
                    patchIdToname[patch.PatchId]);
                return new { Patch = patch, Zoot = zoot, Distances = distances };
            })
            .ToList();

        // Up to 20 most frequent attributes shared by more than one patch ("zoot" itself excluded).
        var commonLabels = withDistancesAndLabels
            .SelectMany(entry => entry.Zoot.MeaningfulTextAttributes()
                .Select(attribute => new { entry.Patch, Attr = attribute }))
            .GroupBy(pair => pair.Attr, pair => pair.Patch)
            .OrderByDescending(group => group.Count())
            .Where(group => group.Count() > 1 && group.Key != "zoot")
            .Take(20)
            .ToList();

        // Triples of attributes whose patch sets intersect in more than 10 patches.
        // TakeWhile limits "second" to labels ranked before "first", and "third" to those before "second".
        // The query is deferred; it is enumerated once, when cluster.Labels is built below.
        var labelTriples =
            from first in commonLabels
            from second in commonLabels.TakeWhile(candidate => candidate.Key != first.Key)
            from third in commonLabels.TakeWhile(candidate => candidate.Key != second.Key)
            let shared = first.Intersect(second).Intersect(third).Count()
            where shared > 10
            select new
            {
                first,
                second,
                third,
                Name = first.Key + "^" + second.Key + "^" + third.Key,
                Size = shared
            };

        // One entry per image found in the big file: its label record and its minimum distance,
        // ordered by ascending distance.
        var allFoundMatches = withDistancesAndLabels
            .SelectMany(entry => entry.Distances)
            .GroupBy(found => found.Img)
            .Select(perImage => new
            {
                MinDist = perImage.Min(found => found.Distance),
                ZootLabel = zootLabelsByName[OverallDataAccessor.GetCleanName(perImage.Key)]
            })
            .OrderBy(match => match.MinDist)
            .Select(match => Tuple.Create(match.ZootLabel, match.MinDist))
            .ToList();

        // Score each triple by the correlation between distance and "record carries all three attributes".
        cluster.Labels =
            (from triple in labelTriples
             let labels = new[] { triple.first.Key, triple.second.Key, triple.third.Key }
             let corr = allFoundMatches.PointBiserialCorrelation(
                 record => record.MeaningfulTextAttributes().Intersect(labels).Count() == labels.Length)
             orderby Math.Abs(corr) descending
             select new ClusterLabel { Correlation = corr, Label = triple.Name, Count = triple.Size })
            .ToArray();

        Console.Write('!'); // progress: cluster finished
    });

    // Keep only labelled clusters, strongest absolute correlation first.
    // (Labels is non-empty for every row that passes the filter, so Max is always defined.)
    filteredClusters.Rows = filteredClusters.Rows
        .Where(row => row.Labels.Any())
        .OrderByDescending(row => row.Labels.Max(label => Math.Abs(label.Correlation)))
        .ToList();

    var labelledBinPath = filteredBinPath.Replace(".bin", "-with-labels-triples.bin");
    filteredClusters.Save(labelledBinPath);

    // The HTML is rendered from the file that was just written, not from the in-memory object.
    var withLabels = AllResults.Load(labelledBinPath);
    var labelledHtmlPath = filteredBinPath.Replace(".bin", "-with-labels-triples.html");
    using (var htmlWriter = new StreamWriter(labelledHtmlPath))
    {
        withLabels.Render(htmlWriter);
    }
}
/// <summary>
/// Runs the filtering/clustering pipeline for one result file and records statistics for each step.
/// </summary>
/// <remarks>
/// Steps:
/// 1. If no "-tresholdBasedCleaned.bin" save point exists, load the file, drop rows flagged by
///    HasNearDuplicates, HasTooManyCloseMatches and IsTooEquidistant, and save the save point.
/// 2. For every parameter combination registered for <paramref name="filename"/>: reload the
///    save point, filter neighbours by the distance-derivative ratio, repeat the reference-based
///    shrink until nothing is removed (at most 30 extra rounds), cluster, and write the clustered
///    .bin, its .html rendering and an "-essential-knn.bin" subset of the original file.
/// Statistics go both to "filtering-statistics-selection.csv" (appended) and to the console.
/// </remarks>
/// <param name="filename">
/// Result file to process. Combinations are looked up by this exact string, so only
/// "conv3-local.bin", "conv4-local.bin" and "conv5-local.bin" trigger step 2; for any other
/// value the lookup is empty and only step 1 runs.
/// </param>
private static void CalculateStats(string filename)
{
    using (var file = new StreamWriter("filtering-statistics-selection.csv", append: true))
    using (var sw = new CompositionWriter(new[] { file, Console.Out }))
    {
        AllResults loadedFile;

        // Reports on whatever loadedFile currently refers to; only called after it has been assigned.
        void PrintStats(string stepName)
        {
            loadedFile.PrintStats(filename, stepName, sw);
        }

        var smallerFileName = filename.Replace(".bin", "-tresholdBasedCleaned.bin");
        if (!File.Exists(smallerFileName))
        {
            Console.WriteLine("Starting from scatch, no previous save point");
            loadedFile = AllResults.Load(filename);
            PrintStats("Default-all");

            loadedFile.Rows.RemoveAll(r => r.HasNearDuplicates());
            GC.Collect();
            PrintStats("Near-duplicate-candidates-removed");

            loadedFile.Rows.RemoveAll(r => r.HasTooManyCloseMatches());
            PrintStats("Too-large-candidates-removed");

            loadedFile.Rows.RemoveAll(r => r.IsTooEquidistant());
            PrintStats("Equidistant-candidates-removed");

            loadedFile.Save(smallerFileName);
        }

        // Parameter sets to evaluate, grouped by the file they apply to.
        var combinations = new[]
        {
            new { File = "conv3-local.bin", Ratio = 0.91, Max = 400, Min = 10 },
            new { File = "conv3-local.bin", Ratio = 0.88, Max = 400, Min = 8 },
            new { File = "conv3-local.bin", Ratio = 0.96, Max = 50, Min = 10 },
            new { File = "conv4-local.bin", Ratio = 0.91, Max = 800, Min = 12 },
            new { File = "conv4-local.bin", Ratio = 0.89, Max = 800, Min = 8 },
            new { File = "conv4-local.bin", Ratio = 0.94, Max = 400, Min = 12 },
            new { File = "conv5-local.bin", Ratio = 0.8, Max = 800, Min = 8 },
            new { File = "conv5-local.bin", Ratio = 0.76, Max = 50, Min = 6 },
            new { File = "conv5-local.bin", Ratio = 0.92, Max = 200, Min = 8 },
        }.ToLookup(x => x.File);
        // new[]{5,6,7,9,10,11,13,14,15}

        // Unfiltered copy of the input; source of the "essential knn" subsets below.
        var bigFile = AllResults.Load(filename);
        Console.WriteLine(filename + " was big-loaded.");

        foreach (var c in combinations[filename])
        {
            var ratio = c.Ratio;

            // Each combination starts again from the threshold-cleaned save point.
            loadedFile = AllResults.Load(smallerFileName);
            loadedFile.Rows.ForEach(r => r.FilterNeigbhoursUsingDistanceDerivative(ratio));
            loadedFile.RefreshReferenceMap();
            loadedFile.RefBasedShrink();
            loadedFile.RefreshReferenceMap();

            // Keep shrinking until a round removes nothing, capped at 30 further rounds.
            for (int i = 1; i < 31; i++)
            {
                var removed = loadedFile.RefBasedShrink();
                loadedFile.RefreshReferenceMap();
                if (removed == 0)
                {
                    Console.WriteLine($"Nothing removed at iteration {i}, stopping ref-based shrink for {ratio}");
                    break;
                }
            }

            // foreach (var maxImagesTreshold in new[]{25,50,100,200,400,800,1600})
            // foreach (var minImagesTreshold in new[]{2,4,6,8,10,12})
            var clustered = ClusterDecomposition.GroupIntoClusters(loadedFile, c.Max, c.Min);
            clustered.PrintStats(filename, $"After-clustering;{ratio};{c.Max};{c.Min}", sw);

            var clusterName = filename.Replace(".bin", $"deriv_{ratio}-max_{c.Max}-min_{c.Min}.bin");
            clustered.Save(clusterName);
            Console.WriteLine(clusterName + " was saved.");

            // Fix: the previous Replace("bin", ".html") rewrote every "bin" substring in the path
            // and turned the trailing ".bin" into "..html". Only the extension must change.
            var htmlName = Path.ChangeExtension(clusterName, ".html");
            using (var htmlw = new StreamWriter(htmlName))
            {
                clustered.Render(htmlw);
            }
            Console.WriteLine(clusterName + " was rendered.");

            var wlie = clustered.ImageEncoding.Reverse();
            var wlpe = clustered.PatchEncoding.Reverse();

            // Every patch that appears in a cluster (hit or query), re-expressed with the ids of
            // the unfiltered file so it can be matched against that file's row queries.
            var interestingImagePatches = clustered.Rows
                .SelectMany(r => r.Hits.Select(h => h.Hit).Concat(new[] { r.Query }))
                .Distinct()
                .Select(p => new Patch
                {
                    ImageId = bigFile.ImageEncoding[wlie[p.ImageId]],
                    PatchId = bigFile.PatchEncoding[wlpe[p.PatchId]]
                })
                .ToLookup(x => x);

            // Subset of the unfiltered file restricted to rows whose query is a clustered patch.
            // Encodings are shared with bigFile; only the row list is new.
            var newBigFile = new AllResults
            {
                ImageEncoding = bigFile.ImageEncoding,
                PatchEncoding = bigFile.PatchEncoding,
                Rows = bigFile.Rows.Where(rr => interestingImagePatches.Contains(rr.Query)).ToList()
            };
            Console.WriteLine(clusterName + " 's essential knn was shrinked.");
            newBigFile.Save(clusterName.Replace(".bin", "-essential-knn.bin"));

            Console.WriteLine(
                "After filtering of {2} = {0} rows remaining, {1} was removed",
                newBigFile.Rows.Count,
                bigFile.Rows.Count - newBigFile.Rows.Count,
                clusterName);
        }
    }

    Console.WriteLine(filename + " Done");
}