/// <summary>
/// Removes tokens from the term dictionaries of this model and its child nodes, keeping or dropping the specified tokens
/// </summary>
/// <param name="keys">Tokens to filter</param>
/// <param name="inverseFilter">if set to <c>true</c> it will remove all tokens other than the specified ones; if <c>false</c>, it removes the specified tokens</param>
/// <param name="filterFirstLine">if set to <c>true</c> it filters this node and its direct children; otherwise it filters the leaf nodes (via <see cref="GetLeafs"/>)</param>
/// <returns>Number of distinct tokens removed from this model and children nodes</returns>
public Int32 FilterSelectedFeatures(List<String> keys, Boolean inverseFilter = true, Boolean filterFirstLine = true)
{
    Int32 removedCount = 0;

    // select which nodes participate in the filtering pass
    List<SpaceDocumentModel> iteration = new List<SpaceDocumentModel>();

    if (filterFirstLine)
    {
        iteration.Add(this);
        iteration.AddRange(Children);
    }
    else
    {
        iteration = GetLeafs(true);
    }

    for (int i = 0; i < iteration.Count; i++)
    {
        removedCount += iteration[i].terms.FilterTokens(keys, inverseFilter);
    }

    return removedCount;
}
/// <summary>
/// Assigns the label to the document model and, via breadth-first traversal, to every descendant model
/// </summary>
/// <param name="model">The model.</param>
/// <param name="label">The label.</param>
/// <param name="space">The space whose label-to-document links are updated.</param>
public static void SetLabel(this SpaceDocumentModel model, SpaceLabel label, SpaceModel space)
{
    space.LabelToDocumentLinks.Add(label, model, 1.0);

    // walk the tree level by level, linking each descendant to the label
    List<SpaceDocumentModel> currentLevel = model.Children.ToList();

    while (currentLevel.Count > 0)
    {
        List<SpaceDocumentModel> nextLevel = new List<SpaceDocumentModel>();

        foreach (SpaceDocumentModel child in currentLevel)
        {
            nextLevel.AddRange(child.Children);
            space.LabelToDocumentLinks.Add(label, child, 1.0);
        }

        currentLevel = nextLevel;
    }
}
/// <summary>
/// Creates a deep copy of this space model, cloning term dictionaries, documents,
/// categories, topics and labels, and rebuilding the label links against the clones
/// </summary>
/// <returns>An independent copy of this model</returns>
public SpaceModel Clone()
{
    SpaceModel clone = new SpaceModel();

    clone.terms = terms.Clone();
    clone.terms_known_label = terms_known_label.Clone();
    clone.terms_unknown_label = terms_unknown_label.Clone();
    clone.documents = documents.Clone<SpaceDocumentModel>(false);
    clone.categories = categories.Clone<SpaceCategoryModel>(false);
    clone.topics = topics.CloneTerm<SpaceTopic>();
    clone.labels = labels.CloneTerm<SpaceLabel>();

    // rebuild label -> document links so they point at cloned instances, matched by name
    foreach (var sourceLabel in labels)
    {
        var cloneLabel = clone.labels.First(x => x.name == sourceLabel.name);

        foreach (var link in LabelToDocumentLinks.GetAllLinked(sourceLabel))
        {
            SpaceDocumentModel cloneDocument = clone.documents.First(x => x.name == link.name);
            clone.LabelToDocumentLinks.Add(cloneLabel, cloneDocument, link.weight);
        }
    }

    // rebuild label -> category links the same way
    foreach (var sourceLabel in labels)
    {
        var cloneLabel = clone.labels.First(x => x.name == sourceLabel.name);

        foreach (var link in LabelToCategoryLinks.GetAllLinked(sourceLabel))
        {
            SpaceCategoryModel cloneCategory = clone.categories.First(x => x.name == link.name);
            clone.LabelToCategoryLinks.Add(cloneLabel, cloneCategory, link.weight);
        }
    }

    return clone;
}
/// <summary>
/// Learns term statistics from the given document model: merges leaf term dictionaries
/// into this model and, optionally, mirrors the complete child tree as stats models
/// </summary>
/// <param name="learnFrom">The document model to learn from.</param>
/// <param name="log">The log builder passed to child stats models.</param>
/// <param name="learnCompleteTreeStructure">if set to <c>true</c>, children recursively learn from the corresponding subtree.</param>
public void LearnFrom(SpaceDocumentModel learnFrom, ILogBuilder log, Boolean learnCompleteTreeStructure)
{
    name = learnFrom.name;
    documentScope = learnFrom.documentScope;

    // aggregate all leaf-level terms into this model's dictionaries
    foreach (SpaceDocumentModel leaf in learnFrom.GetLeafs())
    {
        terms.MergeDictionary(leaf.terms);
        termsChildCount.CountTokens(leaf.terms.GetTokens());
    }

    // mirror the source's child structure; recurse only when the full tree is requested
    foreach (SpaceDocumentModel child in learnFrom.Children)
    {
        SpaceDocumentStatsModel childStats = new SpaceDocumentStatsModel(child.name, log);

        if (learnCompleteTreeStructure)
        {
            childStats.LearnFrom(child, log, learnCompleteTreeStructure);
        }

        Children.Add(childStats);
    }
}
/// <summary>
/// Constructs a document model from raw text: tokenizes, stems, counts terms and
/// registers the tokens into the space context's label-scoped term dictionaries
/// </summary>
/// <param name="text">The text.</param>
/// <param name="name">The name.</param>
/// <param name="context">The context receiving token counts.</param>
/// <param name="stemmContext">The stemm context.</param>
/// <param name="tokenizer">The tokenizer.</param>
/// <param name="isKnownDocument">if set to <c>true</c> tokens are counted into <c>terms_known_label</c>, otherwise into <c>terms_unknown_label</c>.</param>
/// <param name="metrics">Optional counters, incremented with token statistics for this document.</param>
/// <returns>The constructed document model</returns>
public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
{
    var tokens = tokenizer.Tokenize(text);
    if (metrics != null)
    {
        metrics.TokensDoc += tokens.Length; // total token occurrences
    }

    TokenDictionary tokenDictionary = new TokenDictionary(tokens);
    if (metrics != null)
    {
        metrics.UniqueTokensDoc += tokenDictionary.Count; // distinct surface forms
    }

    // build the stemmed dictionary; surface forms sharing a stem accumulate their frequencies
    TokenDictionary stemmDictionary = new TokenDictionary();
    List<String> tkn = tokenDictionary.GetTokens();
    for (int i = 0; i < tkn.Count; i++)
    {
        String stemmed = stemmContext.Stem(tkn[i]);
        stemmDictionary.CountToken(stemmed, tokenDictionary.GetTokenFrequency(tkn[i]));
    }

    if (metrics != null)
    {
        metrics.StemmedTokensDoc += stemmDictionary.Count; // distinct stems
    }

    SpaceDocumentModel document = new SpaceDocumentModel();
    document.name = name;
    document.terms = stemmDictionary;
    document.Length = tokens.Length;

    if (spaceSettings.DoMaintainWordIndex)
    {
        document.Words = new int[document.Length];
    }

    // register every token occurrence into the label-scoped dictionaries and,
    // when enabled, record its term ID at its position in the word index
    // (the loop index doubles as the word-index position, replacing the old redundant counter)
    for (int i = 0; i < tokens.Length; i++)
    {
        String stemmed = stemmContext.Stem(tokens[i]);

        if (isKnownDocument)
        {
            context.terms_known_label.AddToken(stemmed);
        }
        else
        {
            context.terms_unknown_label.AddToken(stemmed);
        }

        if (spaceSettings.DoMaintainWordIndex)
        {
            document.Words[i] = context.terms.GetTokenID(stemmed);
        }
    }

    return document;
}
/// <summary>
/// Projects all term counts and word indexes from children to this instance or new instance
/// </summary>
/// <param name="newInstance">if set to <c>true</c>, results go into a fresh model copying this one's type/name/weight; if <c>false</c>, this instance is updated in place.</param>
/// <returns>The model (this instance, or the new one) holding the merged term dictionary</returns>
public SpaceDocumentModel Flatten(Boolean newInstance = true)
{
    SpaceDocumentModel output = null;
    List<SpaceDocumentModel> iteration = new List<SpaceDocumentModel>();
    if (!newInstance)
    {
        // in-place mode: note that `this` is NOT queued for iteration here,
        // so only descendants (added below) contribute to the merged terms
        output = this;
    }
    else
    {
        output = new SpaceDocumentModel();
        iteration.Add(this);
        output.type = type;
        output.name = name;
        output.weight = weight;
    }
    List<List<String>> wordIndexes = new List<List<string>>();
    TokenDictionary new_terms = new TokenDictionary();
    iteration.AddRange(Children);
    // breadth-first pass: merge each visited node's terms (and collect its word
    // index when maintained) into the aggregate
    while (iteration.Any())
    {
        List<SpaceDocumentModel> nextIteration = new List<SpaceDocumentModel>();
        for (int i = 0; i < iteration.Count; i++)
        {
            if (SpaceModelConstructor.spaceSettings.DoMaintainWordIndex)
            {
                wordIndexes.Add(iteration[i].GetWordIndexed());
            }
            new_terms.MergeDictionary(iteration[i].terms);
            // NOTE(review): descends only into children of nodes whose own term
            // dictionary is empty — nodes already carrying terms are treated as
            // pre-aggregated; confirm this is intended rather than always descending
            if (iteration[i].terms.Count == 0)
            {
                nextIteration.AddRange(iteration[i].Children);
            }
        }
        iteration = nextIteration;
    }
    // concatenate the collected word indexes, remapped to IDs in the merged dictionary
    List<Int32> wsum = new List<int>();
    for (int i2 = 0; i2 < wordIndexes.Count; i2++)
    {
        wsum.AddRange(new_terms.GetIDsByTokens(wordIndexes[i2]));
    }
    output.terms = new_terms;
    if (SpaceModelConstructor.spaceSettings.DoMaintainWordIndex)
    {
        output.Words = wsum.ToArray();
        // NOTE(review): this writes the `length` field on THIS instance even when
        // newInstance == true (output is a different object) — presumably
        // output.length was intended; verify against callers before changing
        length = wsum.Count;
    }
    return (output);
    //output.terms.MergeDictionary(GetTerms(false, true));
}