/// <summary>
        /// Rebuilds <c>ColorSelectors</c> for the distinct items found in <paramref name="instances"/>:
        /// every distinct item gets a root color taken from <c>rootGradient</c> and a selector over a
        /// gradient that runs from that color at <paramref name="minBrightness"/> to the same color at
        /// <paramref name="maxBrightness"/>.
        /// </summary>
        /// <param name="instances">Items to count; each distinct item receives its own selector.</param>
        /// <param name="minBrightness">Value assigned to the <c>V</c> component of the gradient start color.</param>
        /// <param name="maxBrightness">Value assigned to the <c>V</c> component of the gradient end color.</param>
        public void Prepare(IEnumerable <T> instances, float minBrightness = 0.6F, float maxBrightness = 0.9F)
        {
            ColorSelectors = new Dictionary <T, circularSelector <String> >();

            // NOTE(review): RootColors is appended to but, unlike ColorSelectors, not re-created here.
            // If it is a plain dictionary, a second Prepare() call with overlapping items will
            // presumably fail on a duplicate key - confirm against the owning class.

            frequencyCounter <T> counter = new frequencyCounter <T>();

            foreach (var inst in instances)
            {
                counter.Count(inst);
            }

            var types      = counter.GetDistinctItems();
            var rootColors = rootGradient.GetColorHSVSteps(types.Count);

            // Walk the distinct items by position: the position is the index into rootColors,
            // so there is no need to search the list again for every element.
            for (Int32 ti = 0; ti < types.Count; ti++)
            {
                var type      = types[ti];
                var rootColor = rootColors[ti];

                var A = rootColor.Clone();
                A.V = minBrightness;
                var B = rootColor.Clone();
                B.V = maxBrightness;

                // The dark variant is both the registered root color and the gradient start.
                var darkHex   = A.GetHexColor(true);
                var brightHex = B.GetHexColor(true);

                RootColors.Add(type, darkHex);

                ColorGradient typeGradient = new ColorGradient(darkHex, brightHex, ColorGradientFunction.AllAToB);

                // One gradient step is requested per counted occurrence of the item.
                ColorSelectors.Add(type, new circularSelector <string>(typeGradient.GetColorSteps(counter.GetFrequencyForItem(type))));
            }
        }
        /// <summary>
        /// Builds a finger print that contains only the XPaths present in every finger print of <paramref name="input"/>.
        /// </summary>
        /// <param name="input">The finger prints to intersect.</param>
        /// <returns>
        /// A new finger print holding the shared XPaths, or <c>null</c> when <paramref name="input"/> is
        /// <c>null</c> or empty, or when no XPath occurs in all finger prints.
        /// </returns>
        public static StructureFingerPrint CommonFingerPrint(IEnumerable <StructureFingerPrint> input)
        {
            // Null is treated like an empty sequence, which also yields null below.
            if (input == null)
            {
                return(null);
            }

            frequencyCounter <String> XPathFrequencyCounter = new frequencyCounter <string>();

            Int32 printCount = 0;

            foreach (StructureFingerPrint print in input)
            {
                // Count every XPath at most once per finger print. Otherwise a path repeated inside a
                // single print would inflate its frequency and could either reach the "in all prints"
                // bin without being shared, or overshoot it although it is shared.
                HashSet <String> seenInPrint = new HashSet <String>();

                foreach (String xpath in print.XPathList)
                {
                    if (seenInPrint.Add(xpath))
                    {
                        XPathFrequencyCounter.Count(xpath);
                    }
                }
                printCount++;
            }

            var bins = XPathFrequencyCounter.GetFrequencyBins();

            // Only the bin whose key equals the number of prints holds XPaths seen in all of them.
            if (!bins.ContainsKey(printCount))
            {
                return(null);
            }

            StructureFingerPrint output = new StructureFingerPrint();

            foreach (String xpath in bins[printCount])
            {
                output.XPathList.Add(xpath);
            }

            return(output);
        }
Example #3
0
        /// <summary>
        /// Builds <see cref="LeafNodeDictionary"/> and <see cref="LeafNodeDictionaryEntryNGram"/>s for each document, to allow performance optimization
        /// </summary>
        /// <param name="documents">The documents. The sequence is copied once, so lazily produced sequences are safe.</param>
        /// <param name="leafSelectXPath">The leaf select x path, leave blank to use from settings, <see cref="DocumentSimilaritySettings.XPathToSelectLeafs"/></param>
        /// <param name="tagsToIgnore">The tags to ignore, leave unspecified to use from settings, <see cref="DocumentSimilaritySettings.TagsToIgnore"/>.</param>
        /// <returns>
        /// Result with the leaf dictionaries and n-gram sets registered per document; documents whose
        /// n-gram construction or registration threw are listed in <c>DocumentsWithExceptions</c>.
        /// </returns>
        public DocumentSimilarityResult Prepare(IEnumerable <HtmlNode> documents, String leafSelectXPath = "", List <String> tagsToIgnore = null)
        {
            leafSelectXPath = leafSelectXPath.or(settings.XPathToSelectLeafs, LeafNodeDictionary.DefaultNodeSelectionXPath);
            tagsToIgnore    = tagsToIgnore.or(settings.TagsToIgnore, LeafNodeDictionary.DefaultTagsToIgnore);

            // Snapshot the input: it is walked twice and the second pass looks the nodes up by key,
            // so a lazily evaluated sequence must not be allowed to yield different instances.
            List <HtmlNode> documentList = new List <HtmlNode>(documents);

            DocumentSimilarityResult result = new DocumentSimilarityResult();

            frequencyCounter <String> xpathCounter = new frequencyCounter <string>();

            Dictionary <HtmlNode, LeafNodeDictionary> leafDictionary = new Dictionary <HtmlNode, LeafNodeDictionary>();

            // Pass 1: build a leaf dictionary per document and count XPath occurrences across all of them.
            foreach (HtmlNode documentA in documentList)
            {
                LeafNodeDictionary leafNodeDictionaryA = new LeafNodeDictionary(documentA, leafSelectXPath, tagsToIgnore);

                foreach (var entry in leafNodeDictionaryA.items)
                {
                    xpathCounter.Count(entry.XPath);
                }
                leafDictionary.Add(documentA, leafNodeDictionaryA);
            }

            // The XPaths with the top frequency are stripped from every dictionary before n-grams are built.
            var commonXPaths = xpathCounter.GetItemsWithTopFrequency();

            foreach (var pair in leafDictionary)
            {
                pair.Value.RemoveEntriesByXPath(commonXPaths);
            }

            // Pass 2: build the n-gram sets and register everything in the result. Failures are
            // recorded per document instead of aborting the whole preparation.
            foreach (HtmlNode documentA in documentList)
            {
                try
                {
                    LeafNodeDictionary leafNodeDictionaryA = leafDictionary[documentA];

                    List <LeafNodeDictionaryEntryNGram> nGrams_A = setAnalysisTools <LeafNodeDictionaryEntry> .getNGramSet <LeafNodeDictionaryEntryNGram>(leafNodeDictionaryA.items, settings.nGramWidth, settings.nGramMode);

                    result.DocumentsByLeafDictionary.Add(leafNodeDictionaryA, documentA);
                    result.DocumentsByNGrams.Add(nGrams_A, documentA);
                    result.LeafDictionaryByDocuments.Add(documentA, leafNodeDictionaryA);
                    result.NGramsByDocuments.Add(documentA, nGrams_A);
                }
                catch (Exception ex)
                {
                    result.DocumentsWithExceptions.Add(documentA, ex);
                }
            }
            return(result);
        }
Example #4
0
        /// <summary>
        /// Resets the tag counters, rebuilds the index and writes four files into <paramref name="folder"/>:
        /// the XPath list of the items, the tag frequency bins of the items, the frequency bins of
        /// <c>NodeTagCounter</c>, and the DGML graph produced by <c>BuildDGML()</c>.
        /// </summary>
        /// <param name="folder">Folder that supplies the output paths.</param>
        /// <param name="name">Name inserted into every file name, after the <c>nd_</c> prefix.</param>
        /// <returns>The graph that was built and saved.</returns>
        public DirectedGraphWithSourceData Publish(folderNode folder, String name)
        {
            TagCounter     = new frequencyCounter <string>();
            NodeTagCounter = new frequencyCounter <string>();
            RebuildIndex();

            var    overwrite = imbSCI.Data.enums.getWritableFileMode.overwrite;
            String prefix    = "nd_" + name;

            String listFile       = folder.pathFor(prefix + "_list.txt", overwrite, "List of selected nodes");
            String statsFile      = folder.pathFor(prefix + "_stats.txt", overwrite, "Stats of selected nodes");
            String graphFile      = folder.pathFor(prefix + "_graph.dgml", overwrite, "Structure graph");
            String graphStatsFile = folder.pathFor(prefix + "_graph_stats.txt", overwrite, "Stats of the complete graph");

            // One XPath per line; the node names are tallied on the same pass.
            StringBuilder listText = new StringBuilder();
            foreach (var entry in items)
            {
                listText.AppendLine(entry.XPath);
                TagCounter.Count(entry.node.Name);
            }

            // One line per frequency bin: "<key> <members as inline CSV>".
            StringBuilder statsText = new StringBuilder();
            foreach (var tagBin in TagCounter.GetFrequencyBins())
            {
                statsText.Append(tagBin.Key).Append(" ").AppendLine(tagBin.Value.toCsvInLine());
            }

            // NOTE(review): NodeTagCounter is not touched by this method after the reset above, and its
            // bins are read before BuildDGML() runs. Presumably RebuildIndex() fills it - confirm,
            // otherwise the graph stats file is always empty.
            StringBuilder graphStatsText = new StringBuilder();
            foreach (var nodeBin in NodeTagCounter.GetFrequencyBins())
            {
                graphStatsText.Append(nodeBin.Key).Append(" ").AppendLine(nodeBin.Value.toCsvInLine());
            }

            DirectedGraphWithSourceData graph = BuildDGML();
            graph.Save(graphFile, overwrite);

            File.WriteAllText(listFile, listText.ToString());
            File.WriteAllText(statsFile, statsText.ToString());
            File.WriteAllText(graphStatsFile, graphStatsText.ToString());

            return graph;
        }
        /// <summary>
        /// Tallies, over all columns of the table, how often each <see cref="templateFieldDataTable"/> key
        /// appears in a column's extended properties with a non-empty value.
        /// </summary>
        /// <param name="table">The table whose columns are inspected.</param>
        /// <returns>Counter holding one count per column for every key that had a non-empty value.</returns>
        public static frequencyCounter <templateFieldDataTable> GetColumnSetupStatistics(this DataTable table)
        {
            var stats = new frequencyCounter <templateFieldDataTable>();

            foreach (DataColumn column in table.Columns)
            {
                var properties = column.ExtendedProperties;

                foreach (var key in properties.Keys)
                {
                    // Keys of any other type are not meta-setters and are skipped.
                    if (!(key is templateFieldDataTable field))
                    {
                        continue;
                    }

                    // A key that is present but carries no value does not count.
                    if (properties[key].isNullOrEmpty())
                    {
                        continue;
                    }

                    stats.Count(field);
                }
            }

            return stats;
        }
        /// <summary>
        /// Runs the base computation and, when the dominant content type is numeric, infers the
        /// thousand delimiter, the decimal delimiter, the number of decimal places and the value
        /// type name from where the last ',' and '.' sit in the cell contents.
        /// </summary>
        internal override void Compute()
        {
            base.Compute();

            if (dominantType != CellContentType.numeric)
            {
                return;
            }

            // Each counter tallies how many characters follow the last occurrence of the delimiter.
            frequencyCounter <Int32> RightPositionOfComma = new frequencyCounter <Int32>();
            frequencyCounter <Int32> RightPositionOfDot   = new frequencyCounter <Int32>();

            foreach (CellContentInfo info in ContentInfos)
            {
                String content = info.content;

                // Nothing to measure; an empty string contains neither delimiter anyway.
                if (String.IsNullOrEmpty(content))
                {
                    continue;
                }

                Int32 lastIndex = content.Length - 1;

                Int32 commaPos = content.LastIndexOf(',');
                if (commaPos > -1)
                {
                    RightPositionOfComma.Count(lastIndex - commaPos);
                }

                Int32 dotPos = content.LastIndexOf('.');
                if (dotPos > -1)
                {
                    RightPositionOfDot.Count(lastIndex - dotPos);
                }
            }

            Boolean hasComma = RightPositionOfComma.Any();
            Boolean hasDot   = RightPositionOfDot.Any();

            // Ask for the most frequent position only when something was counted.
            Int32 comma_pos = hasComma ? RightPositionOfComma.GetMostFrequentItem() : 0;
            Int32 dot_pos   = hasDot ? RightPositionOfDot.GetMostFrequentItem() : 0;

            if (hasComma && hasDot)
            {
                // The delimiter further from the right end is the thousand separator. The position
                // of the other one already equals the number of characters after it, i.e. the
                // decimal places - same as in the single-delimiter branches below.
                if (comma_pos > dot_pos)
                {
                    thousantDelimiter = ",";
                    decimalDelimiter  = ".";
                    decimalPlaces     = dot_pos;
                }
                else
                {
                    thousantDelimiter = ".";
                    decimalDelimiter  = ",";
                    decimalPlaces     = comma_pos;
                }
            }
            else if (hasComma)
            {
                // NOTE(review): thousantDelimiter keeps whatever value it had before this call in the
                // single-delimiter branches - confirm that this is intended.
                decimalDelimiter = ",";
                decimalPlaces    = comma_pos;
            }
            else if (hasDot)
            {
                decimalDelimiter = ".";
                decimalPlaces    = dot_pos;
            }
            else
            {
                thousantDelimiter = "";
                decimalDelimiter  = "";
                decimalPlaces     = 0;
            }

            if (!decimalDelimiter.isNullOrEmpty() && !thousantDelimiter.isNullOrEmpty())
            {
                ValueTypeName = nameof(Decimal);
            }
            else if (!thousantDelimiter.isNullOrEmpty())
            {
                ValueTypeName = nameof(Int32);
            }
            else if (!decimalDelimiter.isNullOrEmpty())
            {
                ValueTypeName = nameof(Double);
            }
            else
            {
                ValueTypeName = nameof(Int32);
            }
        }