/// <summary>
        /// Filters the term dictionaries of this model and of related nodes against the given token list.
        /// </summary>
        /// <param name="keys">Tokens handed to <c>FilterTokens</c> as the filter list.</param>
        /// <param name="inverseFilter">Forwarded unchanged to <c>FilterTokens</c>; as originally documented, <c>true</c> removes all tokens other than the specified ones.</param>
        /// <param name="filterFirstLine">if set to <c>true</c>, this instance and its direct <c>Children</c> are filtered; otherwise the models returned by <c>GetLeafs(true)</c> are filtered.</param>
        /// <returns>Sum of the values returned by <c>FilterTokens</c> over all processed models.</returns>
        public Int32 FilterSelectedFeatures(List <String> keys, Boolean inverseFilter = true, Boolean filterFirstLine = true)
        {
            List <SpaceDocumentModel> targets;

            if (filterFirstLine)
            {
                // This node first, then its immediate children (deeper descendants are not visited here).
                targets = new List <SpaceDocumentModel> { this };
                targets.AddRange(Children);
            }
            else
            {
                targets = GetLeafs(true);
            }

            Int32 removed = 0;

            for (int i = 0; i < targets.Count; i++)
            {
                removed += targets[i].terms.FilterTokens(keys, inverseFilter);
            }

            return removed;
        }
        /// <summary>
        /// Links the given label to the model and to every node below it in the <c>Children</c> hierarchy.
        /// </summary>
        /// <param name="model">Root document model; it is linked first.</param>
        /// <param name="label">The label to link.</param>
        /// <param name="space">The space whose <c>LabelToDocumentLinks</c> receives the links, each with weight 1.0.</param>
        public static void SetLabel(this SpaceDocumentModel model, SpaceLabel label, SpaceModel space)
        {
            space.LabelToDocumentLinks.Add(label, model, 1.0);

            // Walk the hierarchy one level at a time, starting with the direct children.
            List <SpaceDocumentModel> currentLevel = model.Children.ToList();

            while (currentLevel.Count > 0)
            {
                var nextLevel = new List <SpaceDocumentModel>();

                for (int i = 0; i < currentLevel.Count; i++)
                {
                    SpaceDocumentModel node = currentLevel[i];

                    nextLevel.AddRange(node.Children);
                    space.LabelToDocumentLinks.Add(label, node, 1.0);
                }

                currentLevel = nextLevel;
            }
        }
        // Example no. 3
        // 0
        /// <summary>
        /// Creates a copy of this space model: term dictionaries, documents, categories, topics and labels are cloned,
        /// then the label-to-document and label-to-category links are re-created between the cloned objects.
        /// </summary>
        /// <returns>The new <c>SpaceModel</c> instance.</returns>
        public SpaceModel Clone()
        {
            SpaceModel clone = new SpaceModel
            {
                terms               = terms.Clone(),
                terms_known_label   = terms_known_label.Clone(),
                terms_unknown_label = terms_unknown_label.Clone(),
                documents           = documents.Clone <SpaceDocumentModel>(false),
                categories          = categories.Clone <SpaceCategoryModel>(false),
                topics              = topics.CloneTerm <SpaceTopic>(),
                labels              = labels.CloneTerm <SpaceLabel>()
            };

            // Rebuild label -> document links; cloned counterparts are located by name.
            foreach (var sourceLabel in labels)
            {
                var linkedDocuments = LabelToDocumentLinks.GetAllLinked(sourceLabel);
                var clonedLabel     = clone.labels.First(candidate => candidate.name == sourceLabel.name);

                foreach (var linkedDocument in linkedDocuments)
                {
                    SpaceDocumentModel clonedDocument = clone.documents.First(candidate => candidate.name == linkedDocument.name);
                    clone.LabelToDocumentLinks.Add(clonedLabel, clonedDocument, linkedDocument.weight);
                }
            }

            // Rebuild label -> category links the same way.
            foreach (var sourceLabel in labels)
            {
                var linkedCategories = LabelToCategoryLinks.GetAllLinked(sourceLabel);
                var clonedLabel      = clone.labels.First(candidate => candidate.name == sourceLabel.name);

                foreach (var linkedCategory in linkedCategories)
                {
                    SpaceCategoryModel clonedCategory = clone.categories.First(candidate => candidate.name == linkedCategory.name);
                    clone.LabelToCategoryLinks.Add(clonedLabel, clonedCategory, linkedCategory.weight);
                }
            }

            return clone;
        }
        // Example no. 4
        // 0
        /// <summary>
        /// Populates this stats model from a document model: copies its name and document scope, merges the terms of
        /// its leaf nodes, and adds one child stats model per direct child of the source.
        /// </summary>
        /// <param name="learnFrom">The document model to learn from.</param>
        /// <param name="log">Log builder passed to each created child stats model.</param>
        /// <param name="learnCompleteTreeStructure">if set to <c>true</c>, each child stats model recursively learns from its source child; otherwise children are created with their name only.</param>
        public void LearnFrom(SpaceDocumentModel learnFrom, ILogBuilder log, Boolean learnCompleteTreeStructure)
        {
            name          = learnFrom.name;
            documentScope = learnFrom.documentScope;

            // Aggregate term statistics from the leaf nodes of the source.
            var leaves = learnFrom.GetLeafs();
            foreach (SpaceDocumentModel leaf in leaves)
            {
                terms.MergeDictionary(leaf.terms);
                termsChildCount.CountTokens(leaf.terms.GetTokens());
            }

            // Mirror the direct children of the source as child stats models.
            foreach (SpaceDocumentModel child in learnFrom.Children)
            {
                var childStats = new SpaceDocumentStatsModel(child.name, log);

                if (learnCompleteTreeStructure)
                {
                    childStats.LearnFrom(child, log, learnCompleteTreeStructure);
                }

                Children.Add(childStats);
            }
        }
        // Example no. 5
        // 0
        /// <summary>
        /// Constructs a document model from raw text: tokenizes the text, builds a stemmed term dictionary
        /// and registers every stemmed token in the label dictionaries of the context.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="name">The name assigned to the created document.</param>
        /// <param name="context">Space model whose <c>terms_known_label</c> or <c>terms_unknown_label</c> receives the stemmed tokens, and whose <c>terms</c> supplies token IDs for the word index.</param>
        /// <param name="stemmContext">The stemming context used to stem each token.</param>
        /// <param name="tokenizer">The tokenizer applied to <paramref name="text"/>.</param>
        /// <param name="isKnownDocument">if set to <c>true</c>, stemmed tokens are added to <c>context.terms_known_label</c>; otherwise to <c>context.terms_unknown_label</c>.</param>
        /// <param name="metrics">Optional metrics; when supplied, the token, unique-token and stemmed-token counts of this document are added to it.</param>
        /// <returns>The created document model. It is not added to <c>context.documents</c> by this method.</returns>
        public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
        {
            var tokens = tokenizer.Tokenize(text);

            if (metrics != null)
            {
                metrics.TokensDoc += tokens.Length;                                // total tokens
            }

            TokenDictionary tokenDictionary = new TokenDictionary(tokens);

            if (metrics != null)
            {
                metrics.UniqueTokensDoc += tokenDictionary.Count;                  // unique tokens
            }

            // Re-key the raw token frequencies by stem.
            TokenDictionary stemmDictionary = new TokenDictionary();
            List <String>   uniqueTokens    = tokenDictionary.GetTokens();

            for (int i = 0; i < uniqueTokens.Count; i++)
            {
                String stem = stemmContext.Stem(uniqueTokens[i]);
                stemmDictionary.CountToken(stem, tokenDictionary.GetTokenFrequency(uniqueTokens[i]));
            }

            if (metrics != null)
            {
                metrics.StemmedTokensDoc += stemmDictionary.Count;                 // stemmed tokens
            }

            // Read the setting once so that allocating Words and writing into it always agree.
            Boolean maintainWordIndex = spaceSettings.DoMaintainWordIndex;

            SpaceDocumentModel document = new SpaceDocumentModel();

            document.name   = name;
            document.terms  = stemmDictionary;
            document.Length = tokens.Length;

            if (maintainWordIndex)
            {
                document.Words = new int[document.Length];
            }

            for (int i = 0; i < tokens.Length; i++)
            {
                String stem = stemmContext.Stem(tokens[i]);

                if (isKnownDocument)
                {
                    context.terms_known_label.AddToken(stem);
                }
                else
                {
                    context.terms_unknown_label.AddToken(stem);
                }

                if (maintainWordIndex)
                {
                    // Token position i maps to the ID the context assigns to its stem.
                    document.Words[i] = context.terms.GetTokenID(stem);
                }
            }

            return document;
        }
        /// <summary>
        /// Projects term counts and word indexes from the node hierarchy into a single model.
        /// </summary>
        /// <remarks>
        /// With a new instance, this node and its children are merged into the result. When flattening in place,
        /// only the children are merged and this instance's <c>terms</c> are replaced by the merged dictionary.
        /// The children of a visited node are descended into only when that node's own <c>terms</c> dictionary is empty.
        /// </remarks>
        /// <param name="newInstance">if set to <c>true</c>, a new model carrying this instance's type, name and weight is created and returned; otherwise this instance is updated and returned.</param>
        /// <returns>The flattened model: a new instance, or this instance when <paramref name="newInstance"/> is <c>false</c>.</returns>
        public SpaceDocumentModel Flatten(Boolean newInstance = true)
        {
            // Read the setting once so that collecting and applying the word index stay consistent.
            Boolean maintainWordIndex = SpaceModelConstructor.spaceSettings.DoMaintainWordIndex;

            SpaceDocumentModel        output;
            List <SpaceDocumentModel> iteration = new List <SpaceDocumentModel>();

            if (newInstance)
            {
                output = new SpaceDocumentModel();

                output.type   = type;
                output.name   = name;
                output.weight = weight;

                // Only a fresh instance also absorbs this node's own terms.
                iteration.Add(this);
            }
            else
            {
                output = this;
            }

            iteration.AddRange(Children);

            List <List <String> > wordIndexes = new List <List <string> >();
            TokenDictionary       new_terms   = new TokenDictionary();

            while (iteration.Any())
            {
                List <SpaceDocumentModel> nextIteration = new List <SpaceDocumentModel>();

                for (int i = 0; i < iteration.Count; i++)
                {
                    SpaceDocumentModel node = iteration[i];

                    if (maintainWordIndex)
                    {
                        wordIndexes.Add(node.GetWordIndexed());
                    }

                    new_terms.MergeDictionary(node.terms);

                    // Descend only through nodes that carry no terms of their own.
                    if (node.terms.Count == 0)
                    {
                        nextIteration.AddRange(node.Children);
                    }
                }

                iteration = nextIteration;
            }

            output.terms = new_terms;

            if (maintainWordIndex)
            {
                List <Int32> wsum = new List <int>();

                for (int i = 0; i < wordIndexes.Count; i++)
                {
                    wsum.AddRange(new_terms.GetIDsByTokens(wordIndexes[i]));
                }

                output.Words = wsum.ToArray();

                // Set the length on the returned model, which is not this instance when newInstance is true.
                output.length = wsum.Count;
            }

            return output;
        }