Пример #1
0
 /// <summary>
 /// Type initializer: loads the k-NN graph named by the "OverAllKnnGraphBin" app setting
 /// and builds the reversed encodings and the per-query row index from it.
 /// </summary>
 static OverallDataAccessor()
 {
     var knnGraphPath = ConfigurationManager.AppSettings["OverAllKnnGraphBin"];
     allClusters = AllResults.Load(knnGraphPath);

     allImages  = allClusters.ImageEncoding.Reverse();
     allPatches = allClusters.PatchEncoding.Reverse();

     // Rows are bucketed by the (image id, patch id) pair of their query;
     // when several rows share a query only the first one is kept.
     var rowsByQuery = allClusters.Rows.GroupBy(row => new Tuple<int, int>(row.Query.ImageId, row.Query.PatchId));
     distanceLookup = rowsByQuery.ToDictionary(bucket => bucket.Key, bucket => bucket.First());
 }
Пример #2
0
        /// <summary>
        /// Loads the results stored at <paramref name="path"/>, groups them into clusters and
        /// writes the outcome beside the input: once as a binary file and once as HTML.
        /// </summary>
        /// <param name="path">Location of the input file; ".bin" in it is substituted to form the output names.</param>
        private static void GraphComponentDecomposition(string path)
        {
            var clusteredResults = GroupIntoClusters(AllResults.Load(path));

            var binaryTarget = path.Replace(".bin", "-patchClusters.bin");
            clusteredResults.Save(binaryTarget);

            // The HTML file is always overwritten (append is off).
            var htmlTarget = path.Replace(".bin", "-patchLevelClustersReduced.html");
            using (var htmlWriter = new StreamWriter(htmlTarget, false))
            {
                clusteredResults.Render(htmlWriter);
            }
        }
Пример #3
0
        /// <summary>
        /// Runs Filter and Print over every "*conv4*.bin" file found in the hard-coded
        /// protobuf folder, reporting progress on the console.
        /// </summary>
        private static void RenderData()
        {
            const string protobufFolder = @"G:\siret\zoot\protobuf";

            var candidates = Directory.GetFiles(protobufFolder, "*conv4*.bin");
            for (var index = 0; index < candidates.Length; index++)
            {
                var current = candidates[index];
                Console.WriteLine($"{current} started");

                var results = AllResults.Load(current);

                // Filter returns the name that Print is then given together with the loaded results.
                var outputName = Filter(results, current);
                Print(outputName, results);

                Console.WriteLine(current + " is done now");
            }
        }
Пример #4
0
        /// <summary>
        /// Parses the Spark rule output ("part*" files in a hard-coded folder) and prints the
        /// rules whose input has two or three items into an HTML report, then waits for Enter.
        /// </summary>
        private static void SparkMasketBasketParsing()
        {
            // NOTE(review): the prompt below is printed but no pattern is actually read from the console.
            Console.WriteLine("Enter pattern of Spark .txt file(s)");

            var partFiles   = Directory.EnumerateFiles(@"G:\siret\spark-out\rules_conv_5-occur_5.0-conf_0.1", "part*");
            var ruleLines   = partFiles.SelectMany(partFile => File.ReadLines(partFile));
            var parsedRules = SparkResults.Parse(ruleLines);

            var encodedResults = AllResults.Load(@"G:\siret\zoot\protobuf\local-conv5-cleaned-shrinked.bin");
            var imageEncoding  = encodedResults.ImageEncoding;

            using (var htmlWriter = new StreamWriter(@"G:\siret\spark-viz\market-basket-conv5-large-filtered.html"))
            {
                // Print receives the encoding inverted (value -> key).
                var invertedEncoding = imageEncoding.ToDictionary(entry => entry.Value, entry => entry.Key);
                parsedRules.Print(htmlWriter, invertedEncoding,
                                  rule => rule.Input.Length > 1 && rule.Input.Length < 4);
            }

            Console.WriteLine("Printed. Pres enter...");
            Console.ReadLine();
        }
Пример #5
0
        /// <summary>
        /// Shrinks the overall k-NN graph to the rows whose query patch is referenced by the
        /// filtered-patches file, and saves the result with an "-essential-knn.bin" suffix.
        /// </summary>
        /// <remarks>
        /// The rows are removed from <c>OverallDataAccessor.allClusters</c> itself, so the shared
        /// instance is modified in place.
        /// </remarks>
        private static void CreateSmallerBinsRoutine()
        {
            var filtered           = AllResults.Load(ConfigurationManager.AppSettings["FilteredPatchesBin"]);
            var filteredImageNames = filtered.ImageEncoding.Reverse();
            var filteredPatchNames = filtered.PatchEncoding.Reverse();
            var overall            = OverallDataAccessor.allClusters;

            // Every patch mentioned in the filtered file, either as a hit or as a query.
            var referencedPatches = filtered.Rows
                                    .SelectMany(row => row.Hits.Select(hit => hit.Hit).Concat(new[] { row.Query }))
                                    .Distinct();

            // Re-express each patch in the id space of the overall graph (filtered id -> name -> overall id).
            var translatedPatches = referencedPatches.Select(patch => new Patch
            {
                ImageId = overall.ImageEncoding[filteredImageNames[patch.ImageId]],
                PatchId = overall.PatchEncoding[filteredPatchNames[patch.PatchId]]
            });
            var patchesToKeep = translatedPatches.ToLookup(patch => patch);

            var removed = overall.Rows.RemoveAll(row => !patchesToKeep.Contains(row.Query));

            var target = ConfigurationManager.AppSettings["OverAllKnnGraphBin"].Replace(".bin", "-essential-knn.bin");
            overall.Save(target);

            Console.WriteLine("After filtering {0} rows remaining, {1} was removed", overall.Rows.Count, removed);
        }
Пример #6
0
        /// <summary>
        /// Program entry point. When the first argument is "run-offline-stats", "csv-to-bin" or
        /// "bin-all-smaller" the matching routine is run and the method returns. Otherwise every
        /// cluster of the file named by the "FilteredPatchesBin" app setting gets label triples
        /// assigned, clusters without labels are dropped, and the result is saved and rendered
        /// to HTML under the "-with-labels-triples" suffix.
        /// </summary>
        /// <param name="args">Command-line arguments; only the first one is inspected.</param>
        public static void Main(string[] args)
        {
            if (args.Length > 0 && args[0] == "run-offline-stats")
            {
                StatsCalculationRoutine();
                return;
            }

            if (args.Length > 0 && args[0] == "csv-to-bin")
            {
                CsvToProtobuf.CreateProtobufFile();
                return;
            }

            if (args.Length > 0 && args[0] == "bin-all-smaller")
            {
                CreateSmallerBinsRoutine();
                return;
            }

            // Default mode: label the filtered clusters.
            var zootLabels       = ZootLabelProcessingTests.AllRecords;
            var zootLabelsByName = zootLabels.CreateIndex(x => new[] { x.id }).Unique();

            var filteredClusters = AllResults.Load(ConfigurationManager.AppSettings["FilteredPatchesBin"]);


            // Image id -> lower-cased file name without extension; this is the key used to look up zootLabelsByName below.
            var fromIdToName      = filteredClusters.ImageEncoding.ToDictionary(x => x.Value, x => Path.GetFileNameWithoutExtension(x.Key).ToLower());
            var patchIdToname     = filteredClusters.PatchEncoding.Reverse();
            var imageIdToFullName = filteredClusters.ImageEncoding.Reverse();

            //filteredClusters.Rows = filteredClusters.Rows.Take(10).ToList();

            // Each cluster is processed independently; '.' is written when a cluster starts and '!' when it is finished.
            Parallel.ForEach(filteredClusters.Rows, cluster =>
            {
                Console.Write('.');
                // All patches of the cluster: its hits plus the query itself.
                var involved = cluster.Hits.Select(x => x.Hit).ToList();
                involved.Add(cluster.Query);

                // For every distinct patch: its label record and its hits from the big file.
                var withDistancesAndLabels =
                    (from i in involved.Distinct()
                     let name = fromIdToName[i.ImageId]
                                let Zoot = zootLabelsByName[name]
                                           let Distances = OverallDataAccessor.FindHitsInBigFile(imageIdToFullName[i.ImageId],
                                                                                                 patchIdToname[i.PatchId])
                                                           select new { Patch = i, Zoot, Distances })
                    .ToList();

                // The (at most 20) most frequent attributes shared by more than one patch, excluding the literal "zoot".
                var commonLabels = withDistancesAndLabels
                                   .SelectMany(x => x.Zoot.MeaningfulTextAttributes().Select(a => new { x.Patch, Attr = a }))
                                   .GroupBy(x => x.Attr, x => x.Patch)
                                   .OrderByDescending(g => g.Count())
                                   .Where(g => g.Count() > 1 && g.Key != "zoot")
                                   .Take(20)
                                   .ToList();

                // Despite the name these are label triples; TakeWhile restricts second/third to labels that
                // precede the previous pick in commonLabels. Only triples shared by more than 10 patches are kept.
                // The query is deferred: it is evaluated when cluster.Labels is materialised below.
                var pairsOfLabels =
                    from first in commonLabels
                    from second in commonLabels.TakeWhile(x => x.Key != first.Key)
                    from third in commonLabels.TakeWhile(x => x.Key != second.Key)
                    let intersect = first.Intersect(second).Intersect(third).Count()
                                    where intersect > 10
                                    select new { first, second, third, Name = first.Key + "^" + second.Key + "^" + third.Key, Size = intersect };


                // One (label record, minimal distance) tuple per image found among the hits, ordered by that distance.
                var allFoundMatches = withDistancesAndLabels
                                      .SelectMany(x => x.Distances)
                                      .GroupBy(x => x.Img)
                                      .Select(g => new
                {
                    g.Key, MinDist = g.Min(i => i.Distance),
                    ZootLabel      = zootLabelsByName[OverallDataAccessor.GetCleanName(g.Key)]
                })
                                      .OrderBy(x => x.MinDist)
                                      .Select(x => Tuple.Create(x.ZootLabel, x.MinDist))
                                      .ToList();

                // For each triple, correlate "record carries all three labels" against the matches,
                // and order the resulting labels by absolute correlation, strongest first.
                cluster.Labels =
                    (from cl in pairsOfLabels
                     let labels = new[] { cl.first.Key, cl.second.Key, cl.third.Key }
                     let corr = allFoundMatches.PointBiserialCorrelation(zl =>
                                                                         zl.MeaningfulTextAttributes().Intersect(labels).Count() == labels.Length)
                                orderby Math.Abs(corr) descending
                                select new ClusterLabel {
                    Correlation = corr, Label = cl.Name, Count = cl.Size
                }).ToArray();
                Console.Write('!');
            });

            // Keep only labelled clusters, strongest absolute correlation first. The Length == 0 branch
            // cannot be taken here because the preceding Where already excluded empty label arrays.
            filteredClusters.Rows = filteredClusters.Rows.Where(x => x.Labels.Any()).OrderByDescending(x => x.Labels.Length == 0 ? 0 : x.Labels.Max(l => Math.Abs(l.Correlation))).ToList();
            filteredClusters.Save(ConfigurationManager.AppSettings["FilteredPatchesBin"].Replace(".bin", "-with-labels-triples.bin"));


            // The file that was just saved is loaded again and that copy is what gets rendered.
            var withLabels = AllResults.Load(ConfigurationManager.AppSettings["FilteredPatchesBin"].Replace(".bin", "-with-labels-triples.bin"));

            using (var sw = new StreamWriter(ConfigurationManager.AppSettings["FilteredPatchesBin"]
                                             .Replace(".bin", "-with-labels-triples.html")))
            {
                withLabels.Render(sw);
            }
        }
Пример #7
0
        /// <summary>
        /// Runs the filtering and clustering pipeline for one result file and records statistics.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Unless a "-tresholdBasedCleaned.bin" save point already exists, load the file, remove rows
        ///    flagged as near-duplicates, as having too many close matches or as too equidistant, and
        ///    save that save point.
        /// 2. For every parameter combination registered for <paramref name="filename"/>, reload the
        ///    save point, filter neighbours, shrink by references until nothing more is removed
        ///    (at most 30 further passes), cluster, save and render the clusters, and save the
        ///    subset of the original file that the clusters reference ("-essential-knn.bin").
        /// Statistics are written through a CompositionWriter built over the CSV file
        /// "filtering-statistics-selection.csv" (opened for append) and the console.
        /// </remarks>
        /// <param name="filename">
        /// File to process. It is also the lookup key into the hard-coded parameter table, so
        /// clustering output is produced only when it equals one of the names listed there exactly
        /// (NOTE(review): a name with a directory prefix will match nothing - confirm callers).
        /// </param>
        private static void CalculateStats(string filename)
        {
            using (var file = new StreamWriter("filtering-statistics-selection.csv", append: true))
            using (var sw = new CompositionWriter(new[] { file, Console.Out }))
            {
                AllResults loadedFile;
                void PrintStats(string stepName)
                {
                    loadedFile.PrintStats(filename, stepName, sw);
                }

                var smallerFileName = filename.Replace(".bin", "-tresholdBasedCleaned.bin");
                if (!File.Exists(smallerFileName))
                {
                    Console.WriteLine("Starting from scatch, no previous save point");
                    loadedFile = AllResults.Load(filename);
                    PrintStats("Default-all");

                    loadedFile.Rows.RemoveAll(r => r.HasNearDuplicates());
                    GC.Collect();
                    PrintStats("Near-duplicate-candidates-removed");

                    loadedFile.Rows.RemoveAll(r => r.HasTooManyCloseMatches());
                    PrintStats("Too-large-candidates-removed");

                    loadedFile.Rows.RemoveAll(r => r.IsTooEquidistant());
                    PrintStats("Equidistant-candidates-removed");

                    loadedFile.Save(smallerFileName);
                }

                // Parameter combinations to evaluate, grouped by the file they apply to.
                var combinations = new[]
                {
                    new { File = "conv3-local.bin", Ratio = 0.91, Max = 400, Min = 10 },
                    new { File = "conv3-local.bin", Ratio = 0.88, Max = 400, Min = 8 },
                    new { File = "conv3-local.bin", Ratio = 0.96, Max = 50, Min = 10 },

                    new { File = "conv4-local.bin", Ratio = 0.91, Max = 800, Min = 12 },
                    new { File = "conv4-local.bin", Ratio = 0.89, Max = 800, Min = 8 },
                    new { File = "conv4-local.bin", Ratio = 0.94, Max = 400, Min = 12 },

                    new { File = "conv5-local.bin", Ratio = 0.8, Max = 800, Min = 8 },
                    new { File = "conv5-local.bin", Ratio = 0.76, Max = 50, Min = 6 },
                    new { File = "conv5-local.bin", Ratio = 0.92, Max = 200, Min = 8 },
                }.ToLookup(x => x.File);
                // new[]{5,6,7,9,10,11,13,14,15}

                // The unfiltered file is loaded once and reused for every combination below.
                var bigFile = AllResults.Load(filename);
                Console.WriteLine(filename + " was big-loaded.");
                foreach (var c in combinations[filename])
                {
                    var ratio = c.Ratio;

                    // Every combination starts again from the save point on disk.
                    loadedFile = AllResults.Load(smallerFileName);
                    loadedFile.Rows.ForEach(r => r.FilterNeigbhoursUsingDistanceDerivative(ratio));
                    loadedFile.RefreshReferenceMap();
                    loadedFile.RefBasedShrink();
                    loadedFile.RefreshReferenceMap();

                    // Keep shrinking until a pass removes nothing, with a cap of 30 passes.
                    for (int i = 1; i < 31; i++)
                    {
                        var removed = loadedFile.RefBasedShrink();
                        loadedFile.RefreshReferenceMap();
                        if (removed == 0)
                        {
                            Console.WriteLine($"Nothing removed at iteration {i}, stopping ref-based shrink for {ratio}");
                            break;
                        }
                    }

                    // foreach (var maxImagesTreshold in new[]{25,50,100,200,400,800,1600})
                    //  foreach (var minImagesTreshold in new[]{2,4,6,8,10,12})
                    var clustered = ClusterDecomposition.GroupIntoClusters(loadedFile, c.Max, c.Min);
                    clustered.PrintStats(filename, $"After-clustering;{ratio};{c.Max};{c.Min}", sw);
                    var clusterName = filename.Replace(".bin", $"deriv_{ratio}-max_{c.Max}-min_{c.Min}.bin");
                    clustered.Save(clusterName);
                    Console.WriteLine(clusterName + " was saved.");

                    // Fix: replace the ".bin" extension, not every "bin" substring. The previous
                    // Replace("bin", ".html") produced "...​.html" with a doubled dot and would also
                    // rewrite any "bin" occurring elsewhere in the path.
                    using (var htmlw = new StreamWriter(clusterName.Replace(".bin", ".html")))
                    {
                        clustered.Render(htmlw);
                    }
                    Console.WriteLine(clusterName + " was rendered.");

                    // Translate every patch used by the clusters into the id space of the big file
                    // (cluster id -> name -> big-file id).
                    var wlie = clustered.ImageEncoding.Reverse();
                    var wlpe = clustered.PatchEncoding.Reverse();
                    var interestingImagePatches = clustered
                                                  .Rows.SelectMany(r => r.Hits.Select(h => h.Hit).Concat(new[] { r.Query }))
                                                  .Distinct()
                                                  .Select(p => new Patch
                                                  {
                                                      ImageId = bigFile.ImageEncoding[wlie[p.ImageId]],
                                                      PatchId = bigFile.PatchEncoding[wlpe[p.PatchId]]
                                                  })
                                                  .ToLookup(x => x);

                    // bigFile itself is left untouched; the matching rows are copied into a new container.
                    var newBigFile = new AllResults
                    {
                        ImageEncoding = bigFile.ImageEncoding,
                        PatchEncoding = bigFile.PatchEncoding,
                        Rows          = bigFile.Rows.Where(rr => interestingImagePatches.Contains(rr.Query)).ToList()
                    };
                    Console.WriteLine(clusterName + " 's essential knn was shrinked.");
                    newBigFile.Save(clusterName.Replace(".bin", "-essential-knn.bin"));

                    Console.WriteLine("After filtering of {2} = {0} rows remaining, {1} was removed", newBigFile.Rows.Count, bigFile.Rows.Count - newBigFile.Rows.Count, clusterName);
                }
            }

            Console.WriteLine(filename + " Done");
        }