/// <summary>
        /// Renders the specified set of WebSiteDocuments into a list of <see cref="TextDocumentSet"/>s
        /// </summary>
        /// <param name="input">The input web site document set.</param>
        /// <param name="logger">The logger.</param>
        /// <returns>List of rendered text document sets, one per web site.</returns>
        public List <TextDocumentSet> RenderDocumentSet(WebSiteDocumentsSet input, ILogBuilder logger)
        {
            List <TextDocumentSet> textSetForLabel = new List <TextDocumentSet>();
            Int32 target = input.Count;
            Int32 ti     = 0;

            foreach (WebSiteDocuments webSite in input)
            {
                TextDocumentSet textSet = RenderSiteDocuments(webSite, logger);
                textSetForLabel.Add(textSet);
                ti++;
                Double done = ti.GetRatio(target);
                logger.Append(" [" + done.ToString("P2") + "] ");
            }
            return(textSetForLabel);
        }
        /// <summary>
        /// Constructs feature vectors from the documents in the vector space.
        /// </summary>
        /// <param name="context">The operation context.</param>
        /// <param name="log">The log.</param>
        public void FeatureVectorConstruction(OperationContext context, ILogBuilder log)
        {
            // deploying feature vector space constructor
            featureSpaceConstructor.Deploy(constructorSettings, context.vectorSpace);
            featureSpaceConstructor.Deploy(constructorSettings, context.SelectedFeatures.GetKeys());

            Int32 i = 0;
            Int32 s = 100;

            foreach (IVector vector in context.vectorSpace.documents)
            {
                var lab = designateSpaceLabel(context, vector);

                FeatureVector fv = featureSpaceConstructor.ConstructFeatureVector(vector);

                context.featureVectorByName.Add(vector.name, fv);

                context.featureSpace.documents.Add(fv);
                context.featureSpace.labelToDocumentAssociations.Add(fv, lab, 1);
                if (i % s == 0)
                {
                    Double r = i.GetRatio(context.vectorSpace.documents.Count); // ratio against the collection actually being iterated
                    log.log("Building feature vectors [" + r.ToString("P2") + "] : [" + i + "/" + context.vectorSpace.documents.Count + "]");
                }
                i++;
            }

            log.log("Feature vector construction [" + context.featureSpace.documents.Count + "] done");

        }
Example #3
        /// <summary>
        /// Computes the continual overlap ratio: the length of the contiguous run of matching tokens,
        /// aligned at the first occurrence of B's head in A, divided by the longer list's length.
        /// </summary>
        /// <param name="A">The first token sequence.</param>
        /// <param name="B">The second token sequence.</param>
        /// <returns>Overlap ratio in [0, 1].</returns>
        private static Double getContinualOverlapR(List <String> A, List <String> B)
        {
            // guard: an empty list cannot overlap (also protects B.First() below)
            if (A.Count == 0 || B.Count == 0)
            {
                return 0;
            }

            Int32 cc = 0;

            Boolean synced = false;
            Int32   start  = 0;

            for (int a_i = 0; a_i < A.Count; a_i++)
            {
                if (A[a_i] == B.First())
                {
                    start  = a_i;
                    synced = true;

                    break;
                }
            }

            if (synced)
            {
                // B[0] was aligned with A[start], so B must be compared offset by start
                for (int j = 0; (start + j) < A.Count && j < B.Count; j++)
                {
                    if (A[start + j] == B[j])
                    {
                        cc++;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            return(cc.GetRatio(Math.Max(A.Count, B.Count)));
        }
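        // A minimal walkthrough of getContinualOverlapR (hypothetical inputs; the method is
        // private, and GetRatio is assumed to be the library's Int32 part/whole ratio extension):
        //
        //   List <String> A = new List <String> { "a", "b", "c", "d" };
        //   List <String> B = new List <String> { "b", "c", "x" };
        //   // B[0] == "b" first matches at A[1], so start = 1; the aligned run ("b","c")
        //   // ends when "d" != "x", giving cc = 2.
        //   Double overlap = getContinualOverlapR(A, B);   // 2.GetRatio(Math.Max(4, 3)) = 0.5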
        /// <summary>
        /// Walks up to <paramref name="steps"/> DIV ancestors of <paramref name="divNode"/>, evaluating
        /// table row selections, and returns the selection with the best row-to-column ratio.
        /// </summary>
        /// <param name="divNode">The starting DIV node.</param>
        /// <param name="steps">Maximum number of ancestors to evaluate.</param>
        /// <returns>The best-rated row selection, or null if none was evaluated.</returns>
        public List <HtmlNode> AdaptiveRowSelection(HtmlNode divNode, Int32 steps = 5)
        {
            HtmlNode        head          = divNode;
            Double          bestRatio     = Double.MinValue;
            List <HtmlNode> bestSelection = null;

            for (int i = 0; i < steps; i++)
            {
                if (head == null)
                {
                    break;
                }
                if (!head.Name.Equals("div", StringComparison.InvariantCultureIgnoreCase))
                {
                    break;
                }
                List <HtmlNode> html_tablerows = head.SelectNodesInDepthRange(x => x.Name.Equals(TableSelectionTag, StringComparison.InvariantCultureIgnoreCase), TableSelectionDepthLimit, TableSelectionDepthStart, false);

                Double rate = 0;

                Int32 rows    = html_tablerows.Count;
                Int32 columns = Int32.MaxValue;
                if (html_tablerows.Count > 0)
                {
                    foreach (var r in html_tablerows)
                    {
                        var html_cells = r.SelectNodesInDepthRange(
                            x => x.Name.Equals(RowSelectionTag, StringComparison.InvariantCultureIgnoreCase) &&
                            !x.ChildNodes.Any(y => y.Name.Equals(RowSelectionTag, StringComparison.InvariantCultureIgnoreCase))
                            , RowSelectionDepthLimit, RowSelectionDepthStart, false);

                        columns = Math.Min(columns, html_cells.Count);
                    }

                    if (columns == Int32.MaxValue)
                    {
                        rate = 0;
                    }
                    else
                    {
                        rate = rows.GetRatio(columns);
                    }
                }


                head = head.ParentNode;
                if (rate > bestRatio)
                {
                    bestRatio     = rate;
                    bestSelection = html_tablerows;
                }
            }

            return(bestSelection);
        }
        /// <summary>
        /// Transforms the selection result into a feature vector dictionary, where each vector dimension
        /// is the document's similarity to one category.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The vector similarity function.</param>
        /// <param name="log">The log.</param>
        /// <returns>Feature vector sets, grouped by domain.</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsCategorySimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Category Similarity ...");

            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            Dictionary <String, WeightDictionary> categoryDictionaries = new Dictionary <string, WeightDictionary>();

            foreach (SpaceLabel label in context.spaceModel.labels)
            {
                Relationship <SpaceLabel, SpaceCategoryModel> categoryModel = context.spaceModel.LabelToCategoryLinks.GetAllRelationships(label).FirstOrDefault();

                var c = TermWeightModel.GetWeights(selectedTerms, categoryModel.NodeB, context.spaceModel, label);
                categoryDictionaries.Add(label.name, c);
            }

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();

            Int32  i = 0;
            Int32  p = Math.Max(1, context.Count / 20); // avoid modulo-by-zero for small result sets

            foreach (var entry in context.items)
            {
                i++;

                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);

                FeatureVector fv = new FeatureVector(entry.AssignedID);
                fv.dimensions = new double[context.spaceModel.labels.Count];

                Parallel.ForEach(context.spaceModel.labels, (label) =>
                {
                    var docToClassSimilarity = function.ComputeSimilarity(categoryDictionaries[label.name], documentWeights);
                    fv.dimensions[context.spaceModel.labels.IndexOf(label)] = docToClassSimilarity;
                });


                Int32 r = i % p;
                if (r == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }


                dict.GetOrAdd(entry.DomainID).Add(fv, -1);
            }

            foreach (KeyValuePair <string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation done...");
            return(dict);
        }
Example #6
        /// <summary>
        /// Returns the share of validations that ended as Validated or Modified.
        /// </summary>
        public Double GetScore()
        {
            Int32 part = 0;

            Int32 whole = validations.Count;

            foreach (var pair in validations)
            {
                switch (pair.Value.Outcome)
                {
                case ValidationOutcome.Invalid:
                    break;

                case ValidationOutcome.Modified:
                case ValidationOutcome.Validated:

                    part++;
                    break;

                case ValidationOutcome.undefined:
                    break;
                }
            }

            Double score = part.GetRatio(whole, 0, 0);

            return(score);
        }
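        // Semantics sketch (hypothetical counts): with 10 validations of which 6 ended as
        // Validated or Modified, GetScore() returns 6.GetRatio(10, 0, 0) = 0.6; the extra
        // (0, 0) arguments are assumed to be the extension's fallback values for empty input.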
Example #7
        /// <summary>
        /// Returns the ratio of <paramref name="scaleStep"/> against the total number of steps, optionally compressed into [floor, 1].
        /// </summary>
        public Double GetRatioForScale(Int32 scaleStep, Double floor = 0, Int32 scaleSteps = -1)
        {
            if (scaleSteps == -1)
            {
                scaleSteps = xKeys.Count;
            }

            Double val = scaleStep.GetRatio(scaleSteps);

            if (floor == 0)
            {
                return(val);
            }

            val = val / (1 + floor);
            val = val + floor;
            if (val > 1)
            {
                val = 1;
            }

            return(val);
        }
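        // Worked example (hypothetical values): GetRatioForScale(5, 0, 10) returns
        // 5.GetRatio(10) = 0.5; with floor = 0.2 the value is compressed into [floor, 1]:
        // 0.5 / (1 + 0.2) + 0.2 = 0.6167, capped at 1.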
        /// <summary>
        /// Reduces the document set.
        /// </summary>
        /// <param name="docSet">The document set - web site.</param>
        /// <param name="settings">The settings.</param>
        /// <param name="logger">The logger.</param>
        /// <returns>Rate of reduction</returns>
        public Double ReduceDocumentSet(WebSiteDocuments docSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
        {
            Int32 input  = 0;
            Int32 output = 0;

            foreach (WebSiteDocument document in docSet.documents)
            {
                input += document.HTMLSource.Length;

                String newHtml = ReduceDocument(document.HTMLSource, settings, logger);

                output += newHtml.Length;


                document.HTMLSource = newHtml;
            }

            Double reduction = output.GetRatio(input);

            if (settings.logSiteLevel)
            {
                logger.AppendLine("[" + docSet.domain + "] reduced to: " + reduction.ToString("P2"));
            }

            return(reduction);
        }
        /// <summary>
        /// Computes the Jaccard index: number of common n-grams divided by the number of distinct n-grams in the union
        /// </summary>
        /// <param name="ngrams_A">The first n-gram list.</param>
        /// <param name="ngrams_B">The second n-gram list.</param>
        /// <returns>Jaccard index in [0, 1].</returns>
        public Double GetJaccardIndex(List <T> ngrams_A, List <T> ngrams_B)
        {
            List <T> allNGrams = GetJoinElements(ngrams_A, ngrams_B);

            Int32 common = CountContains(ngrams_A, ngrams_B);


            return(common.GetRatio(allNGrams.Count));
        }
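        // Worked example (hypothetical n-grams): for A = { "ab", "bc", "cd" } and
        // B = { "bc", "cd", "de" }, the union holds 4 distinct n-grams and 2 are shared,
        // so GetJaccardIndex returns 2.GetRatio(4) = 0.5.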
Example #10
        /// <summary>
        /// Builds a report on score granularity, histogram, descriptive statistics and the document selection result.
        /// </summary>
        public ITextRender Report(ITextRender output = null)
        {
            if (output == null)
            {
                output = new builderForMarkdown();
            }

            var scores = items.Select(x => x.score);

            output.AppendHeading("Granularity");

            var   distinct = items.GetDistinctScores();
            Int32 dC       = distinct.Count();

            output.AppendPair("Distinct", dC);
            output.AppendPair("Entries", scores.Count());
            Double r = (Double)dC.GetRatio(scores.Count());

            output.AppendPair("Distinct / Entries", r);

            output.AppendHeading("Cumulative histogram");


            for (int i = 1; i < 11; i++)
            {
                Double l_min = (i - 1).GetRatio(10);
                Double l_max = i.GetRatio(10);
                var    bin   = scores.Where(x => (x > l_min) && (x <= l_max)); // inclusive upper bound, so bin-edge scores are counted
                Double per   = bin.Count().GetRatio(scores.Count());
                output.AppendPair("Bin [" + i + "][" + l_max.ToString("F2") + "]", per.ToString("P2"));
            }

            output.AppendHeading("Descriptive statistics");

            DescriptiveStatistics desc = scores.GetStatistics(true);

            desc.Describe(output);



            output.AppendHeading("Document selection result");

            foreach (DocumentSelectResultEntry result in items)
            {
                output.AppendLine(result.score.ToString("F5") + "\t\t" + result.AssignedID);
            }

            output.AppendHorizontalLine();

            query.Describe(output);

            output.AppendHorizontalLine();

            return(output);
        }
        /// <summary>
        /// Computes the Jaccard index: number of common ngrams divided by the number of distinct ngrams in the union
        /// </summary>
        /// <param name="ngrams_A">The first n-gram list.</param>
        /// <param name="ngrams_b">The second n-gram list.</param>
        /// <returns>Jaccard index in [0, 1].</returns>
        public static Double GetJaccardIndex(List <String> ngrams_A, List <String> ngrams_b)
        {
            List <String> allNGrams = new List <string>();

            Int32 common = ngrams_A.Count(x => ngrams_b.Contains(x));

            allNGrams.AddRange(ngrams_A);
            allNGrams.AddRange(ngrams_b, true);

            return(common.GetRatio(allNGrams.Count));
        }
Example #12
        /// <summary>
        /// Builds vectors from selected features and feature weighting model
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="log">The log.</param>
        public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
        {
            List <string> FV = context.SelectedFeatures.GetKeys();

            log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count() + "]");
            // preparing the model
            weightModel.PrepareTheModel(context.spaceModel, log);


            Int32 i = 0;
            Int32 s = Math.Max(1, context.spaceModel.documents.Count / 10); // avoid modulo-by-zero for small spaces

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.spaceModel.documents)
            {
                var            wd     = weightModel.GetWeights(FV, docModel, context.spaceModel);
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = wd;

                context.vectorSpace.documents.Add(docVec);
                if (i % s == 0)
                {
                    Double r = i.GetRatio(context.spaceModel.documents.Count);
                    log.log("[" + r.ToString("F2") + "]");
                }
                i++;
            }

            if (constructCategories)
            {
                // logger.log(":: Creating VectorSpace instances for categories");
                // building category VSM
                foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
                {
                    var         wd     = weightModel.GetWeights(FV, catModel, context.spaceModel);
                    VectorLabel catVec = new VectorLabel(catModel.name);
                    catVec.terms = wd;


                    context.vectorSpace.labels.Add(catVec);
                }
            }
        }
        /// <summary>
        /// Returns the rate at which the document fits the fingerprint
        /// </summary>
        /// <param name="document">The document.</param>
        /// <returns>Share of XPath expressions in XPathList that resolve to a node in the document.</returns>
        public Double Evaluate(HtmlNode document)
        {
            Int32 m = 0;

            foreach (String xPath in XPathList)
            {
                var node = document.SelectSingleNode(xPath);
                if (node != null)
                {
                    m++;
                }
            }

            return(m.GetRatio(XPathList.Count));
        }
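        // Semantics sketch (hypothetical fingerprint): with XPathList holding four expressions
        // of which three resolve to a node in the document, Evaluate returns 3.GetRatio(4) = 0.75.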
Example #14
        /// <summary>
        /// Renders HeatMapModel of specified size
        /// </summary>
        /// <param name="width">The width.</param>
        /// <param name="height">The height.</param>
        /// <param name="xPeriod">Scaling period for the x-axis function input.</param>
        /// <param name="yPeriod">Scaling period for the y-axis function input.</param>
        /// <returns>The rendered heat map.</returns>
        public HeatMapModel MakeHeatMap(Int32 width, Int32 height, Int32 xPeriod = 20, Int32 yPeriod = 20)
        {
            xAxisFunction.outputRange = new imbNumberScale(numberRangePresetEnum.zeroToOne);
            yAxisFunction.outputRange = new imbNumberScale(numberRangePresetEnum.zeroToOne);

            HeatMapModel map = HeatMapModel.Create(width, height, "D3");

            map.AllocateSize(width, height);

            for (Int32 y = 0; y < height; y++)
            {
                Double yValue = yAxisFunction.GetOutput(y.GetRatio(yPeriod));
                for (Int32 x = 0; x < width; x++)
                {
                    map[x, y] = xAxisFunction.GetOutput(x.GetRatio(xPeriod)) + yValue;
                }
            }

            return(map);
        }
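        // A minimal usage sketch (hypothetical call site; xAxisFunction and yAxisFunction are
        // assumed to be configured on the instance before rendering):
        //
        //   HeatMapModel map = model.MakeHeatMap(200, 100, xPeriod: 25, yPeriod: 25);
        //   // each cell holds xAxisFunction(x / xPeriod) + yAxisFunction(y / yPeriod)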
        /// <summary>
        /// Reduces the dataset, returns total reduction score (%)
        /// </summary>
        /// <param name="dataSet">The data set.</param>
        /// <param name="settings">The settings.</param>
        /// <param name="logger">The logger.</param>
        /// <returns>Total reduction score: document count reduction multiplied by the average size reduction.</returns>
        public double ReduceDataset(IEnumerable <WebSiteDocumentsSet> dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
        {
            logOutputStart = logger.Length;

            if (!dataSet.Any())
            {
                throw new ArgumentException("The specified dataset is empty!", nameof(dataSet));
            }

            List <Double> reductions = new List <double>();

            List <Double> html_reductions = new List <double>();

            Int32 total_input  = 0;
            Int32 total_output = 0;

            foreach (WebSiteDocumentsSet category in dataSet)
            {
                total_input += category.CountDocumentsTotal();
                reductions.Add(ReduceDatasetCategory(category, settings, logger));

                html_reductions.Add(htmlEngine.ReduceDatasetCategory(category, settings.HtmlDocumentReduction, logger));

                total_output += category.CountDocumentsTotal();
            }

            Double average      = reductions.Average();
            Double reduction    = total_output.GetRatio(total_input);
            Double average_html = html_reductions.Average();

            reductionScore = (average_html * reduction);

            logger.log("Dataset document count reduced: " + reduction.ToString("P2"));
            logger.log("Dataset document size reduced (avg): " + average_html.ToString("P2"));

            logger.log("Total reduction score: " + reductionScore.ToString("P2"));

            return(reductionScore);
        }
        /// <summary>
        /// Evaluates the document-selection ranking for score diversity.
        /// </summary>
        /// <param name="ds_loaded">The loaded document selection result.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="filepath">The filepath.</param>
        /// <param name="minDiversity">The minimum diversity.</param>
        /// <returns>True if the ranking is accepted; false if it contains NaN, fewer than two distinct values, or diversity below <paramref name="minDiversity"/>.</returns>
        public static Boolean EvaluateDSRanking(DocumentSelectResult ds_loaded, ILogBuilder logger, String filepath = "", Double minDiversity = 0.01)
        {
            if (filepath == "")
            {
                filepath = ds_loaded.name;
            }

            var   distinct = ds_loaded.items.GetDistinctScores();
            Int32 c        = distinct.Count();


            if (distinct.Contains(Double.NaN))
            {
                logger.log("Ranking scores [" + filepath + "] is refused as it contains NaN entries");
                return(false);
            }

            if (c < 2)
            {
                logger.log("Ranking scores [" + filepath + "] is refused as it contains [" + c + "] distinct values");
                return(false);
            }

            Double rate = c.GetRatio(ds_loaded.items.Count());

            if (rate < minDiversity)
            {
                logger.log("Ranking scores [" + filepath + "] is refused for having [" + rate.ToString("F5") + "] below criterion [" + minDiversity.ToString("F2") + "]");
                return(false);
            }


            logger.log("Ranking scores [" + filepath + "] accepted d=[" + rate.ToString("F5") + "] c=[" + distinct.Count + "] |e|=[" + ds_loaded.items.Count + "]");
            return(true);
        }
Example #17
        /// <summary>
        /// Gets the first significant parent of the node, judged by node-name frequency.
        /// </summary>
        /// <param name="node">The node.</param>
        /// <param name="significanceLevel">The significance level: rate of occurrence; a lower value means only more rare/significant tags qualify.</param>
        /// <returns>The first ancestor whose tag frequency ratio falls below the significance level.</returns>
        public HtmlNode GetFirstSignificantParentByNodeName(HtmlNode node, Double significanceLevel = 0.2)
        {
            HtmlNode head    = node;
            Double   rate    = 1;
            Int32    topFreq = NodeTagCounter.GetTopFrequency();

            while (rate > significanceLevel)
            {
                if (head.ParentNode == null)
                {
                    return(head);
                }

                Int32 freq = NodeTagCounter.GetFrequencyForItem(head.Name);
                if (freq == 0)
                {
                    return(head);
                }
                rate = freq.GetRatio(topFreq);

                head = head.ParentNode;
            }
            return(head);
        }
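        // Semantics sketch (hypothetical frequencies): with topFreq = 500 for the most common
        // tag name, an ancestor whose tag occurs 60 times rates 60.GetRatio(500) = 0.12;
        // since 0.12 < 0.2 the loop exits and the walk ends at that ancestor's parent.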
Example #18
        /// <summary>
        /// Loads the lexic resource.
        /// </summary>
        /// <param name="output">The output.</param>
        /// <param name="resourceFilePath">The resource file path.</param>
        public void LoadLexicResource(ILogBuilder output, String resourceFilePath)
        {
            List <String> lines = new List <String>();

            // <---------------------------------------------- [
            if (isLoaded)
            {
                return;
            }
            String pt = "";

            if (!localCache.isNullOrEmpty())
            {
                pt = localCache;
                lines.AddRange(File.ReadLines(localCache));
            }

            if (lines.Count < 100)
            {
                pt    = resourceFilePath;
                lines = new List <string>();
                lines.AddRange(File.ReadAllLines(resourceFilePath));
            }

            Int32  i      = 0;
            Int32  iCycle = lines.Count() / 20;
            Int32  l      = lines.Count();
            Int32  c      = 0;
            Double p      = 0;

            output.logStartPhase("Loading", "Loading the lexic resource - with mode: " + mode.ToString());
            output.log("Start of loading lexic resource [" + pt + "]");
            Parallel.ForEach(lines, new ParallelOptions {
                MaxDegreeOfParallelism = 1
            }, (line) =>
            {
                string inflectForm = "";
                string lemma       = "";
                string gramTag     = "";

                SelectFromLine(line, out inflectForm, out lemma, out gramTag);

                lexicInflection inflect = null;

                if (!inflectForm.isNullOrEmpty())
                {
                    if (!ContainsKey(inflectForm))
                    {
                        inflect                       = new lexicInflection(line);
                        inflect.lemmaForm             = lemma;
                        inflect.name                  = inflectForm;
                        inflect.inflectedForm         = inflectForm;
                        inflect.lexicalDefinitionLine = line;

                        if (spellAlternator.IsInitiated)
                        {
                            String altInflectedForm = spellAlternator.ConvertFromAtoB(inflectForm);
                            spellAlternatives.GetOrAdd(altInflectedForm, inflectForm);
                        }

                        Add(inflectForm, inflect);
                    }
                    else
                    {
                        inflect = base[inflectForm];
                    }

                    lexicGrammarCase gramCase = null;

                    if (mode == textResourceIndexResolveMode.resolveOnLoad)
                    {
                        var gramTagColl = grammTagConverter.ConvertFromString(gramTag);

                        gramCase = inflect.AddGrammarCase(gramTagColl);
                        gramCase.lexicalDefinitionLine = gramTag;
                    }
                    else
                    {
                        gramCase = new lexicGrammarCase();
                        gramCase.lexicalDefinitionLine = gramTag;
                        gramCase.name = "gc" + i.ToString();
                        inflect.Add(gramCase);
                    }

                    // <----------------- construction of Lemma centered dictionary

                    lexicGraphSetWithLemma lxSet = null;

                    if (!registratedLemmaIndex.ContainsKey(lemma))
                    {
                        lock (LemmaIndexLock)
                        {
                            if (!registratedLemmaIndex.ContainsKey(lemma))
                            {
                                lxSet           = new lexicGraphSetWithLemma();
                                lxSet.lemmaForm = lemma;
                                registratedLemmaIndex.TryAdd(lemma, lxSet);
                            }
                        }
                    }

                    lxSet = registratedLemmaIndex[lemma];

                    if (!lxSet.ContainsKey(inflectForm))
                    {
                        lock (SetLock)
                        {
                            if (!lxSet.ContainsKey(inflectForm))
                            {
                                lxSet.TryAdd(inflect.name, inflect);
                            }
                        }
                    }

                    Interlocked.Increment(ref c);
                    Interlocked.Increment(ref i);
                    if (c > iCycle)
                    {
                        lock (loadStatusLock)
                        {
                            if (c > iCycle)
                            {
                                c = 0;
                                p = i.GetRatio(l);
                                output.AppendLine("Done: _" + p.ToString("P2") + "_");
                            }
                        }
                    }
                }
            });

            output.logEndPhase();
            output.log("End of loading process");
            isLoaded = true;
        }
Example #19
        /// <summary>
        /// Describes the experiment results as a list of report lines.
        /// </summary>
        public List <String> DescribeSelf(List <String> output = null)
        {
            if (output == null)
            {
                output = new List <string>();
            }

            output.Add("Experiment [" + experiment.name + "] done in: " + Duration.ToString("F2") + " minutes");
            output.Add(context.setup.description);


            output.Add("k-Fold cross validation k[" + experiment.validationSetup.k + "] - RND(T/E)SMP[" + experiment.validationSetup.randomize.ToString() + "] - FVE models [" + experiment.models.Count + "] - Classiffiers [" + experiment.classifiers.Count + "]");
            Int32  nCats       = 0;
            Int32  nCases      = 0;
            Double nCasePerCat = 0;

            foreach (var c in context.classes.GetClasses())
            {
                nCats++;
                nCases += c.WebSiteSample.Count();
            }

            nCasePerCat = nCases.GetRatio(nCats);

            output.Add("Categories [" + nCats + "] with [" + nCases + "] -- cases per category [" + nCasePerCat.ToString("F2") + "]");

            var model = context.tools.model as pipelineMCRepo.model.mcRepoProcessModel;

            output.Add("Pages per web site (limit) [" + model.setup.target_languagePagePerSite + "]");

            foreach (var m in context.setup.models)
            {
                String ln = m.name.TrimToMaxLength(15);

                foreach (var fv in m.settings.featureVectors.serialization)
                {
                    if (fv.isActive)
                    {
                        ln = ln.add("[" + fv.name.TrimToMaxLength(10, " ") + "]", " ");
                    }
                    else
                    {
                        ln = ln.add("[" + ("-".Repeat(10)) + "]", " ");
                    }
                }

                output.Add(ln); // emit the per-model feature-vector line (originally built but never added)
            }



            output.Add("----");

            output.Add("The best classifier per FVE models, by cross k-fold mean of F1 (macro-average): ");

            output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}]", "Feature Vector Model", "Top class.", "Macro F1"));

            foreach (var cl in bestPerformingClassifiers)
            {
                if (cl == theBestPerformer)
                {
                    output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}] <-- the best ", cl.Name, cl.Classifier, cl.F1measure));
                }
                else
                {
                    output.Add(String.Format("[{0,-30}] [{1,10}] [{2,10:F5}]", cl.Name, cl.Classifier, cl.F1measure));
                }
            }

            output.Add("----");

            output.Add("The best performer: ");

            output.Add("Name: " + theBestPerformer.Name);
            output.Add("Classifier: " + theBestPerformer.Classifier);
            output.Add("F1 measure: " + theBestPerformer.F1measure.ToString("F5"));

            output.Add("----");

            output.Add("The FVE with highest S1 measure: ");
            output.Add("Name: " + bestModel.modelName);
            output.Add("Range width:    " + bestModel.RangeWidthAvg.ToString("F5"));
            output.Add("Range position: " + bestModel.RangePositionAvg.ToString("F5"));
            output.Add("S1 measure:     " + bestModel.S1Measure.ToString("F5"));



            output.Add("----");

            output.Add("Mean classifier performances by FVE models: ");


            DocumentSetCaseCollectionReport minMean = new DocumentSetCaseCollectionReport();

            minMean.F1measure = 1;
            DocumentSetCaseCollectionReport maxMean = new DocumentSetCaseCollectionReport();

            maxMean.F1measure = 0;

            foreach (var cl in meanPerformanceForExtractors)
            {
                if (cl.F1measure <= minMean.F1measure)
                {
                    minMean = cl;
                }
                if (cl.F1measure > maxMean.F1measure)
                {
                    maxMean = cl;
                }
            }

            foreach (var cl in meanPerformanceForExtractors)
            {
                String lb = " --- ";
                if (cl == minMean)
                {
                    lb = " min ";
                }
                if (cl == maxMean)
                {
                    lb = " max ";
                }

                output.Add(String.Format("[{0,-30}] P[{1,10:F5}] R[{2,10:F5}] F1[{3,10:F5}] [{4,5}]", cl.Name, cl.Precision, cl.Recall, cl.F1measure, lb));
            }

            output.Add(" --- FVE cross-classifier means are computed as quality infication for FVE's configuration");

            output.Add(" --- FVE models and k-fold sample distribution MD5 hash");

            foreach (var c in valColVsModelVsSampleHash)
            {
                output.Add(c);
            }

            return(output);
        }
        /// <summary>
        /// Reduces the dataset category.
        /// </summary>
        /// <param name="dataSet">The data set.</param>
        /// <param name="settings">The settings.</param>
        /// <param name="logger">The logger.</param>
        /// <returns></returns>
        public double ReduceDatasetCategory(WebSiteDocumentsSet dataSet, WebSiteDataSetReductionSettings settings, ILogBuilder logger)
        {

            Int32 total_input = dataSet.CountDocumentsTotal();

            List <WebSiteGraphDiagnosticMark> marks = new List <WebSiteGraphDiagnosticMark>();

            if (settings.marksToRemove != WebSiteGraphDiagnosticMark.none)
            {
                marks = settings.marksToRemove.getEnumListFromFlags <WebSiteGraphDiagnosticMark>();
            }

            List <WebSiteDocuments> toRemove = new List <WebSiteDocuments>();

            foreach (WebSiteDocuments site in dataSet)
            {
                if (settings.marksToRemove != WebSiteGraphDiagnosticMark.none)
                {
                    if (site.extensions.graph == null)
                    {
                        if (settings.logSiteLevel)
                        {
                            logger.log("Site _ [" + site.domain + "] _ flaged for removal because not having graph declared");
                        }
                    }
                    else
                    {
                        foreach (WebSiteGraphDiagnosticMark mark in marks)
                        {
                            if (site.extensions.graph.diagnosticResults.HasFlag(mark))
                            {
                                if (settings.logSiteLevel)
                                {
                                    logger.log("Site _ [" + site.domain + "] _ flaged for removal because of [" + mark.ToString() + "] web graph diagnostic mark");
                                }

                                toRemove.Add(site);
                            }
                        }
                    }
                }
            }

            foreach (WebSiteDocuments site in toRemove)
            {
                if (dataSet.Contains(site))
                {
                    dataSet.Remove(site);
                }
            }

            dataSet.RemoveEmptyDocuments(logger, settings.LimitSettings.minPageLimit, settings.LimitSettings.maxPageLimit);

            Int32 total_output = dataSet.CountDocumentsTotal();

            Double average = total_output.GetRatio(total_input);

            if (settings.logCategoryLevel)
            {
                logger.log("Document count in _ [" + dataSet.name + "] _ reduced to: " + average.ToString("P2"));
            }

            return(average);
        }
Example #21
        /// <summary>
        /// Constructs the webLemmaTable
        /// </summary>
        /// <param name="tableName">Name of the table.</param>
        /// <param name="parser">The parser.</param>
        /// <param name="counter">The counter.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="table">Existing table to fill; a new one is created when null.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c>, the table is built for a single web site.</param>
        /// <returns>The populated web lemma term table.</returns>
        protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
        {
            if (table == null)
            {
                table = new webLemmaTermTable(tableName);
            }

            if (table.Count > 0)
            {
                logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
                if (DoBeep == 1)
                {
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
                    Interlocked.Increment(ref DoBeep);
                }
                return(table);
            }


            List <String> tfdfList = counter.GetIndexForms();
            Int32         i        = 0;
            Int32         c        = 0;
            Int32         li       = 0;
            Int32         limit    = tfdfList.Count + 100;



            if (!tableName.isNullOrEmpty())
            {
                table.name = tableName;
            }


            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();

            Int32 startIndex  = tfdfList.Count;
            Int32 cycleLength = startIndex / 5;

            while (tfdfList.Any())
            {
                String term = tfdfList.FirstOrDefault();
                Int32  d    = tfdfList.Count;

                if (term != null)
                {
                    lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
                    d = d - tfdfList.Count;
                    if (d == 0)
                    {
                        table.unresolved.Add(term);
                        tfdfList.Remove(term);
                        d = 1;
                    }
                    else
                    {
                        Boolean ok = true;

                        if (settings.allowedLemmaTypes.Any())
                        {
                            var tps = inflectSet.GetTagsFromGramTags <pos_type>(pos_type.none);

                            if (settings.strictPosTypePolicy)
                            {
                                if (!tps.ContainsAny(settings.allowedLemmaTypes))
                                {
                                    ok = false;
                                }
                                else
                                {
                                    if (tps.Contains(pos_type.V))
                                    {
                                        ok = false;
                                    }
                                }
                            }
                            else
                            {
                                if (!tps.ContainsAny(settings.allowedLemmaTypes))
                                {
                                    ok = false;
                                }
                            }
                        }


                        if (ok)
                        {
                            List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();
                            List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();



                            webLemmaTerm lemma = new webLemmaTerm();
                            lemma.nominalForm = inflectSet.lemmaForm;
                            lemma.name        = inflectSet.lemmaForm;


                            Double termFrequency     = 0;

                            foreach (lexicInflection inflect in inflectSet.Values)
                            {
                                TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
                                if (cn != null)
                                {
                                    lemma.AFreqPoints += cn.items.Count;
                                    foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                                    {
                                        imbMCDocument document = cntPair.mcElement.GetParentOfType <imbMCDocument>();
                                        documents.AddUnique(document);

                                        imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
                                        if (docSet != null)
                                        {
                                            documentSet.AddUnique(docSet);
                                        }
                                        else
                                        {
                                            logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
                                        }

                                        if (cntPair.flagBag.Contains(cnt_containerType.link))
                                        {
                                            termFrequency += settings.anchorTextFactor;
                                        }
                                        else if (cntPair.flagBag.Contains(cnt_containerType.title))
                                        {
                                            termFrequency += settings.titleTextFactor;
                                        }
                                        else
                                        {
                                            termFrequency += settings.contentTextFactor;
                                        }

                                        cntPair.AddGraph(inflect);
                                    }

                                    lemma.otherForms.AddUnique(cn.indexForm);
                                }
                                else
                                {
                                    lemma.otherForms.AddUnique(inflect.inflectedForm);
                                }
                            }
                            lemma.documentSetFrequency = documentSet.Count;
                            lemma.documentFrequency    = documents.Count;
                            lemma.termFrequency        = termFrequency;
                            lemmas.Add(lemma);
                        }
                    }
                }
                li++;
                i = i + d;
                c = c + d;
                d = startIndex - tfdfList.Count;

                if (c > cycleLength)
                {
                    c = 0;
                    logger.AppendLine();
                    logger.log("TF-IDF processed: _" + d.GetRatio(startIndex).ToString("P2") + "_");
                    logger.AppendLine();
                }

                if (li > limit)
                {
                    logger.log("Limit broken at processing WEB Lemma Frequency table at [" + li.ToString() + "]");
                    break;
                }
            }


            if (settings.doComputeTFIDF)
            {
                recompute(table, logger, forSingleWebSite, lemmas);
            }
            else
            {
                foreach (var le in lemmas)
                {
                    table.Add(le);
                }
            }



            return(table);
        }
Example #22
        /// <summary>
        /// Builds a DataTable row from the document-set case, including classifier outcomes and feature vector metrics.
        /// </summary>
        public static DataRow BuildRow(this DocumentSetCaseCollection host, DocumentSetCase setCase, DataTable output, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
        {
            var setClass       = host.setClass;
            var validationCase = host.validationCase;

            DataRow dr = output.NewRow();

            dr["name"] = host.validationCase.name + "_" + setCase.subject.name;

            if (output.Columns.Contains("Origin"))
            {
                dr["Origin"] = host.setClass.name;
            }


            dr["Case"] = setCase.subject.name;

            if (!isTrainingCollection)
            {
                Int32 cor = 0;
                foreach (var cl in validationCase.context.setup.classifiers)
                {
                    String cName = "";
                    Int32  t     = 0;
                    if (setCase.data[cl].selected != null)
                    {
                        cName = setCase.data[cl].selected.name;
                        if (setCase.data[cl].selected.classID == host.rightClassID)
                        {
                            t = 1;
                        }
                        else
                        {
                            t = 0;
                        }
                    }
                    else
                    {
                        cName = "- not set -";
                    }
                    dr["ClassResultName" + cl.name] = cName;

                    cor += t;

                    dr["EvalTrue" + cl.name] = t;
                }

                dr["Correct"] = cor.GetRatio(validationCase.context.setup.classifiers.Count);
            }

            foreach (var cl in setCase.data.setClassCollection.GetClasses())
            {
                foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                {
                    if (fv.isActive)
                    {
                        dr[fv.name + "_" + cl.treeLetterAcronim] = setCase.data.featureVectors[cl.classID][fv];
                    }
                }
            }



            if (doFVAnalysis)
            {

                Dictionary <String, rangeFinderWithData> rangers = new Dictionary <string, rangeFinderWithData>();

                foreach (var cl in setCase.data.setClassCollection.GetClasses())
                {
                    foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                    {
                        if (fv.isActive)
                        {
                            if (!rangers.ContainsKey(fv.name))
                            {
                                rangers.Add(fv.name, new rangeFinderWithData(fv.name));
                            }

                            rangers[fv.name].Learn(setCase.data.featureVectors[cl.classID][fv]);
                        }
                    }
                }



                foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                {
                    if (fv.isActive)
                    {
                        dr["FVRange" + fv.name]   = rangers[fv.name].doubleEntries.GetStdDeviation(false);
                        dr["CFV_Ratio" + fv.name] = rangers[fv.name].GetPositionInRange(setCase.data.featureVectors[setClass.classID][fv]);
                        // output.Add("CFV_Ratio" + fv.name, "Value ratio indicating the position of correct category FV, within the range", fv.name, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5", fv.name + " Range Position").SetGroup("FV Metrics");
                    }
                    // output.Add("Terms_" + pair.treeLetterAcronim, "If classification was true", "M_" + pair.classID, typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Matched for " + pair.name).SetGroup("FEATURE VECTORS");
                }
            }



            output.Rows.Add(dr);
            return(dr);
        }
        /// <summary>
        /// Transforms the selection result into a feature vector dictionary, where each vector dimension
        /// is the document's similarity to another page in its group.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The vector similarity function.</param>
        /// <param name="groupmode">How pages are grouped: per category, per site, or whole dataset.</param>
        /// <param name="log">The log.</param>
        /// <returns>Feature vector sets, grouped according to <paramref name="groupmode"/>.</returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsPageSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ScoreComputationModeEnum groupmode, ILogBuilder log)
        {
            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            Dictionary <String, WeightDictionary> documentDictionaries = new Dictionary <string, WeightDictionary>();


            foreach (var entry in context.items)
            {
                WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, entry.spaceDocument, context.spaceModel);
                documentDictionaries.Add(entry.AssignedID, documentWeights);
            }


            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();


            Int32  i = 0;
            Int32  p = Math.Max(1, context.Count / 10); // avoid modulo-by-zero for small result sets



            Dictionary <string, List <DocumentSelectResultEntry> > relative_groups = null;


            if (groupmode == ScoreComputationModeEnum.category)
            {
                Dictionary <string, List <string> > assignIDByLabel = context.spaceModel.LabelToDocumentLinks.GetAllRelationShipByName(true);

                // drop the UNKNOWN label before grouping, so the removal actually affects the groups
                if (assignIDByLabel.ContainsKey(SpaceLabel.UNKNOWN))
                {
                    assignIDByLabel.Remove(SpaceLabel.UNKNOWN);
                }

                relative_groups = context.GetByAssignIDCategory(assignIDByLabel, log);
                log.log("... Page Similarity ... Groups by category");
            }
            else if (groupmode == ScoreComputationModeEnum.site)
            {
                relative_groups = context.GetByDomain(log);
                log.log("... Page Similarity ... Groups by site");
            }
            else if (groupmode == ScoreComputationModeEnum.dataset)
            {
                relative_groups = new Dictionary <string, List <DocumentSelectResultEntry> >();
                relative_groups.Add("dataset", context.items);
                log.log("... Page Similarity ... dataset");
            }


            ConcurrentDictionary <String, Double> computedPairs = new ConcurrentDictionary <string, double>();


            foreach (var domainPair in relative_groups)
            {
                List <DocumentSelectResultEntry> relatives = domainPair.Value;


                foreach (var entry in relatives)
                {
                    i++;
                    FeatureVector fv = new FeatureVector(entry.AssignedID);

                    fv.dimensions = new double[relatives.Count - 1];

                    Int32 hostInd = relatives.IndexOf(entry);
                    Parallel.ForEach(relatives, (pair) =>
                    {
                        Int32 ind = relatives.IndexOf(pair);
                        if (ind >= hostInd)
                        {
                            ind = ind - 1;
                        }

                        if (pair.AssignedID != entry.AssignedID)
                        {
                            Double docToClassSimilarity = 0;

                            if (computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                            {
                                docToClassSimilarity = computedPairs[entry.AssignedID + pair.AssignedID];
                            }
                            else if (computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                            {
                                docToClassSimilarity = computedPairs[pair.AssignedID + entry.AssignedID];
                            }
                            else
                            {
                                var vecA             = documentDictionarties[pair.AssignedID];
                                var vecB             = documentDictionarties[entry.AssignedID];
                                docToClassSimilarity = function.ComputeSimilarity(vecA, vecB);
                                if (!computedPairs.ContainsKey(entry.AssignedID + pair.AssignedID))
                                {
                                    computedPairs.GetOrAdd(entry.AssignedID + pair.AssignedID, docToClassSimilarity);
                                }
                                else if (!computedPairs.ContainsKey(pair.AssignedID + entry.AssignedID))
                                {
                                    computedPairs.GetOrAdd(pair.AssignedID + entry.AssignedID, docToClassSimilarity);
                                }
                            }

                            fv.dimensions[ind] = docToClassSimilarity;
                        }
                    });



                    Int32 r = i % p;
                    if (r == 0)
                    {
                        log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                    }


                    dict.GetOrAdd(domainPair.Key).Add(fv, -1);
                }
            }



            log.log("... Preparation finished ...");

            return(dict);
        }
Example #24
 /// <summary>
 /// Computes per-class and per-site averages from the total counters.
 /// </summary>
 public void Compute()
 {
     sitesPerClass = sites.GetRatio(classes);
     pagesPerSite  = pages.GetRatio(sites);
 }
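 // Worked example (hypothetical counts): with classes = 5, sites = 50 and pages = 1000,
 // Compute() yields sitesPerClass = 50.GetRatio(5) = 10 and pagesPerSite = 1000.GetRatio(50) = 20.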
Example #25
        /// <summary>
        /// Constructs the webLemmaTable
        /// </summary>
        /// <param name="counter">The counter.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="table">The table.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
        /// <returns></returns>
        public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
        {
            List <String> tfdfList = counter.GetIndexForms();

            tfdfList.Sort(String.CompareOrdinal);


            Int32 i     = 0;
            Int32 c     = 0;
            Int32 li    = 0;
            Int32 limit = tfdfList.Count + 500;


            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();
            Int32 startIndex           = tfdfList.Count;
            // guard against a zero-length cycle on very small term lists
            Int32 cycleLength          = Math.Max(1, startIndex / 5);

            foreach (String term in tfdfList)
            {
                if (term != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();

                    Double termFrequency = 0;

                    TFDFContainer cn = counter.GetContainer(term);


                    webLemmaTerm lemma = new webLemmaTerm();


                    if (cn != null)
                    {
                        lemma.nominalForm = cn.indexForm;
                        lemma.name        = cn.indexForm;

                        foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                        {
                            imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                            if (document != null)
                            {
                                documents.AddUnique(document);


                                imbMCDocumentSet docSet = document?.parent as imbMCDocumentSet;
                                if (docSet != null)
                                {
                                    documentSet.AddUnique(docSet);
                                }
                            }
                            termFrequency += 1;

                            /*
                             * if (cntPair.flagBag.Contains(cnt_containerType.link))
                             * {
                             *  termFrequency += settings.anchorTextFactor;
                             * }
                             * else if (cntPair.flagBag.Contains(cnt_containerType.title))
                             * {
                             *  termFrequency += settings.titleTextFactor;
                             * }
                             * else
                             * {
                             *  termFrequency += settings.contentTextFactor;
                             * }*/

                            // lemma.otherForms.AddUnique(cntPair.initialForm);
                        }

                        lemma.documentSetFrequency = documentSet.Count;
                        lemma.AFreqPoints          = cn.items.Count();
                        lemma.documentFrequency    = documents.Count;
                        lemma.termFrequency        = termFrequency;
                        lemmas.Add(lemma);
                    }
                    // when cn == null there is no container for this term, so nothing is recorded



                    li++;
                    i = i + 1;
                    c = c + 1;


                    if (c > cycleLength)
                    {
                        c = 0;
                        logger.AppendLine();
                        logger.log("Chunk TF processing: _" + i.GetRatio(startIndex).ToString("P2") + "_ ");
                        logger.AppendLine();
                    }

                    if (li > limit)
                    {
                        logger.log("Limit broken at processing Chunk Lemma Frequency table at [" + li.ToString() + "]");
                        break;
                    }
                }
            }


            // table.WriteOnlyMode = false;


            recompute(table, logger, forSingleWebSite, lemmas);


            // table.ReadOnlyMode = true;


            return(table);
        }
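The core bookkeeping above reduces to counting, per term, its total occurrences (termFrequency) and the number of distinct documents it appears in (documentFrequency). A self-contained sketch of that reduction, under the simplifying assumption that documents are plain token lists rather than imbMCDocument instances:

using System;
using System.Collections.Generic;
using System.Linq;

public static class TfDfSketch
{
    // For each term: (termFrequency, documentFrequency) over a token-list corpus.
    public static Dictionary<String, (Int32 tf, Int32 df)> Count(List<List<String>> documents)
    {
        var result = new Dictionary<String, (Int32 tf, Int32 df)>();
        foreach (var doc in documents)
        {
            foreach (var group in doc.GroupBy(t => t))
            {
                result.TryGetValue(group.Key, out var c);
                // tf accumulates every occurrence; df counts each document once per term
                result[group.Key] = (c.tf + group.Count(), c.df + 1);
            }
        }
        return result;
    }
}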
        /// <summary>
        /// Builds vectors from selected features and feature weighting model
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="log">The log.</param>
        public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
        {
            List <string> FV = context.SelectedFeatures.GetKeys();

            log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count + "] ");

            // preparing the model
            weightModel.PrepareTheModel(context.spaceModel, log);

            // blanking anything existing in vector space
            context.vectorSpace = new VectorSpace();

            List <SpaceDocumentModel> toBlendIntoVectors = DocumentBlenderFunctionExtension.GetDocumentToBlend(blender.options, context.spaceModel.documents, log);

            Int32 i = 0;
            // guard: with fewer than five documents, Count / 5 would be zero and i % s would throw
            Int32 s = Math.Max(1, toBlendIntoVectors.Count / 5);


            Dictionary <String, List <VectorDocument> > labelToDocumentSets = new Dictionary <String, List <VectorDocument> >();


            foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
            {
                labelToDocumentSets.Add(catModel.name, new List <VectorDocument>());
            }

            Int32 unlabeled = 0;

            foreach (SpaceDocumentModel model in toBlendIntoVectors)
            {
                VectorDocument docVec = model.BlendToVector <VectorDocument>(weightModel, context.spaceModel, FV);  //new VectorDocument(model.name);
                context.vectorSpace.documents.Add(docVec);

                if (constructCategories)
                {
                    String l = model.labels.FirstOrDefault();

                    if (!l.isNullOrEmpty() && labelToDocumentSets.ContainsKey(l))
                    {
                        labelToDocumentSets[l].Add(docVec);
                    }
                    else
                    {
                        // counts documents with no label at all, not only labels without a matching category
                        unlabeled++;
                    }
                }


                if (i % s == 0)
                {
                    Double r = i.GetRatio(toBlendIntoVectors.Count);
                    log.log("Blending primary vectors [" + r.ToString("P2") + "] : [" + i + "/" + toBlendIntoVectors.Count + "]");
                }
                i++;
            }

            if (constructCategories && (unlabeled > 0))
            {
                log.log("Vectors [" + unlabeled + "] are unlabeled or lack a matching category");
            }

            if (constructCategories)
            {
                log.log(":: Creating VectorSpace instances for categories");
                // building category VSM
                foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
                {
                    VectorLabel catVec = new VectorLabel(catModel.name);
                    foreach (var docVec in labelToDocumentSets[catModel.name])
                    {
                        catVec.terms.Merge(docVec.terms);
                    }


                    context.vectorSpace.labels.Add(catVec);
                }
            }


            if (weightModel != null)
            {
                weightModel.Dispose();
            }
        }
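The category construction above is essentially a term-wise merge of the member document vectors. A sketch under the assumption that vectors are plain term-to-weight dictionaries and that merging sums weights (the actual Merge semantics of VectorLabel.terms may differ):

using System;
using System.Collections.Generic;

public static class CategoryVectorSketch
{
    // Builds one category vector by summing the term weights of its member documents.
    public static Dictionary<String, Double> MergeLabel(IEnumerable<Dictionary<String, Double>> docVectors)
    {
        var category = new Dictionary<String, Double>();
        foreach (var doc in docVectors)
        {
            foreach (var pair in doc)
            {
                category.TryGetValue(pair.Key, out Double w);
                category[pair.Key] = w + pair.Value; // summation is an assumption, not the library's documented behavior
            }
        }
        return category;
    }
}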



        /// <summary>
        /// Transforms to fv dictionary.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="TermWeightModel">The term weight model.</param>
        /// <param name="function">The function.</param>
        /// <returns></returns>
        public static FeatureVectorSetDictionary TransformToFVDictionaryAsSiteSimilarity(this DocumentSelectResult context, FeatureWeightModel TermWeightModel, IVectorSimilarityFunction function, ILogBuilder log)
        {
            log.log("... Site Similarity ...");

            List <string> selectedTerms = context.selectedFeatures.GetKeys();

            Dictionary <String, WeightDictionary> categoryDictionarties = new Dictionary <string, WeightDictionary>();
            Dictionary <String, WeightDictionary> documentDictionarties = new Dictionary <string, WeightDictionary>();

            var byDomain = context.GetByDomain(log);

            FeatureVectorSetDictionary dict = new FeatureVectorSetDictionary();


            Int32 i = 0;
            // guard: with fewer than ten entries, context.Count / 10 would be zero and i % p would throw
            Int32 p = Math.Max(1, context.Count / 10);

            foreach (var pair in byDomain)
            {
                i++;
                SpaceDocumentModel siteModel = new SpaceDocumentModel();

                foreach (var ent in pair.Value)
                {
                    WeightDictionary documentWeights = TermWeightModel.GetWeights(selectedTerms, ent.spaceDocument, context.spaceModel);
                    documentDictionarties.Add(ent.AssignedID, documentWeights);
                    siteModel.Children.Add(ent.spaceDocument);

                    //siteModel.terms.MergeDictionary(ent.spaceDocument.terms);
                }

                siteModel.Flatten(false);

                categoryDictionarties.Add(pair.Key, TermWeightModel.GetWeights(selectedTerms, siteModel, context.spaceModel));


                foreach (var ent in pair.Value)
                {
                    FeatureVector fv = new FeatureVector(ent.AssignedID);
                    fv.dimensions = new double[context.spaceModel.labels.Count];

                    // only the first dimension is populated: similarity of the page to its own site-level model
                    var docToClassSimilarity = function.ComputeSimilarity(categoryDictionarties[pair.Key], documentDictionarties[ent.AssignedID]);

                    fv.dimensions[0] = docToClassSimilarity;

                    dict.GetOrAdd(pair.Key).Add(fv, -1);
                }

                if (i % p == 0)
                {
                    log.Append(" [" + i.GetRatio(context.Count).ToString("P2") + "] ");
                }
            }



            foreach (KeyValuePair <string, FeatureVectorWithLabelIDSet> pair in dict)
            {
                pair.Value.CloseDeploy();
            }

            log.log("... Preparation finished ...");

            return(dict);
        }
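ComputeSimilarity is pluggable here; a common choice is cosine similarity over sparse weight vectors. A minimal sketch, assuming weights are exposed as Dictionary<String, Double> (the actual WeightDictionary API may differ):

using System;
using System.Collections.Generic;

public static class CosineSketch
{
    public static Double Cosine(Dictionary<String, Double> a, Dictionary<String, Double> b)
    {
        Double dot = 0, normA = 0, normB = 0;
        foreach (var w in a.Values) normA += w * w;
        foreach (var w in b.Values) normB += w * w;

        // iterate the smaller dictionary for the dot product
        var (small, large) = a.Count <= b.Count ? (a, b) : (b, a);
        foreach (var pair in small)
        {
            if (large.TryGetValue(pair.Key, out Double w)) dot += pair.Value * w;
        }

        if (normA == 0 || normB == 0) return 0; // empty vectors have zero similarity
        return dot / (Math.Sqrt(normA) * Math.Sqrt(normB));
    }
}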
Example No. 28
        /// <summary>
        /// Processes the specified source.
        /// </summary>
        /// <param name="source">The source.</param>
        /// <param name="document_level">The document level.</param>
        /// <param name="table">The table.</param>
        /// <param name="parser">The parser.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
        /// <param name="counter">The counter.</param>
        /// <returns></returns>
        public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
        {
            if (counter == null)
            {
                counter = prepareCounter(source);
            }


            List <String> tfdfList = counter.GetIndexForms();

            tfdfList.Sort(String.CompareOrdinal);


            Int32 i     = 0;
            Int32 c     = 0;
            Int32 li    = 0;
            Int32 limit = tfdfList.Count + 500;


            List <webLemmaTerm> lemmas = new List <webLemmaTerm>();
            Int32 startIndex           = tfdfList.Count;
            // guard against a zero-length cycle on very small term lists
            Int32 cycleLength          = Math.Max(1, startIndex / 5);

            foreach (String term in tfdfList)
            {
                if (term != null)
                {
                    List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
                    List <imbMCDocumentElement> documents   = new List <imbMCDocumentElement>();

                    Double termFrequency = 0;

                    TFDFContainer cn = counter.GetContainer(term);


                    webLemmaTerm lemma = new webLemmaTerm();


                    if (cn != null)
                    {
                        lemma.nominalForm = cn.indexForm;
                        lemma.name        = cn.indexForm;

                        foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                        {
                            imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                            if (document != null)
                            {
                                documents.AddUnique(document);


                                imbMCDocumentSet docSet = document?.parent as imbMCDocumentSet;
                                if (docSet != null)
                                {
                                    documentSet.AddUnique(docSet);
                                }
                            }
                            termFrequency += 1;
                        }

                        lemma.documentSetFrequency = documentSet.Count;
                        lemma.AFreqPoints          = cn.items.Count();
                        lemma.documentFrequency    = documents.Count;
                        lemma.termFrequency        = termFrequency;
                        lemmas.Add(lemma);
                    }
                    // when cn == null there is no container for this term, so nothing is recorded



                    li++;
                    i = i + 1;
                    c = c + 1;


                    if (c > cycleLength)
                    {
                        c = 0;
                        // logger defaults to null in the signature, so use null-conditional calls
                        logger?.AppendLine();
                        logger?.log("Token Streams TF processing: _" + i.GetRatio(startIndex).ToString("P2") + "_ ");
                        logger?.AppendLine();
                    }

                    if (li > limit)
                    {
                        logger?.log("Limit broken while processing Token Streams TF at [" + li.ToString() + "]");
                        break;
                    }
                }
            }



            recompute(table, logger, forSingleWebSite, lemmas);



            return(table);
        }
Example No. 29
        /// <summary>
        /// Gets the Dice coefficient: twice the number of shared n-grams, divided by the total number of n-grams in both sets
        /// </summary>
        /// <param name="ngrams_A">The n-grams of set A.</param>
        /// <param name="ngrams_B">The n-grams of set B.</param>
        /// <returns></returns>
        public static Double GetDiceCoefficient(List <String> ngrams_A, List <String> ngrams_B)
        {
            Int32 common = ngrams_A.Count(x => ngrams_B.Contains(x)) * 2;

            return(common.GetRatio(ngrams_A.Count + ngrams_B.Count));
        }
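A worked example, assuming character bigrams: "night" yields {ni, ig, gh, ht} and "nacht" yields {na, ac, ch, ht}; only "ht" is shared, so the coefficient is 2 * 1 / (4 + 4) = 0.25.

var ngramsNight = new List<String> { "ni", "ig", "gh", "ht" };
var ngramsNacht = new List<String> { "na", "ac", "ch", "ht" };
Double dice = GetDiceCoefficient(ngramsNight, ngramsNacht); // 2 * 1 / (4 + 4) = 0.25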
Example No. 30
        /// <summary>
        /// Gets the Dice coefficient: twice the number of shared n-grams, divided by the total number of n-grams in both sets
        /// </summary>
        /// <param name="ngrams_A">The n-grams of set A.</param>
        /// <param name="ngrams_B">The n-grams of set B.</param>
        /// <returns></returns>
        public Double GetDiceCoefficient(List <T> ngrams_A, List <T> ngrams_B)
        {
            Int32 common = CountContains(ngrams_A, ngrams_B) * 2;

            return(common.GetRatio(ngrams_A.Count + ngrams_B.Count));
        }
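Both overloads scan one list per element of the other, so they are O(|A| * |B|). Where n-gram lists grow large, a set-based variant returns the same value in roughly linear time; this is a sketch, not part of the original API, and it reuses the GetRatio extension seen throughout this file:

using System;
using System.Collections.Generic;
using System.Linq;

public static Double GetDiceCoefficientFast(List<String> ngramsA, List<String> ngramsB)
{
    // HashSet membership tests are O(1) on average, versus O(n) for List.Contains
    var setB = new HashSet<String>(ngramsB);
    Int32 common = ngramsA.Count(x => setB.Contains(x)) * 2;

    return common.GetRatio(ngramsA.Count + ngramsB.Count);
}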