/// <summary>
/// Filters the documents and the labeled/unlabeled term dictionaries of a space model
/// so that the tokens returned by <c>spaceModel.terms.GetTokensOtherThan</c> for the
/// selected feature keys are removed from them.
/// </summary>
/// <remarks>
/// <c>spaceModel.terms</c> is only read here; it is not filtered by this method.
/// A progress message is written to <paramref name="log"/> whenever more than one fifth
/// of the document count has been processed since the previous message.
/// </remarks>
/// <param name="spaceModel">The space model to filter.</param>
/// <param name="selectedFeatures">Weight dictionary whose keys are the selected features.</param>
/// <param name="log">Log builder that receives the progress messages.</param>
/// <returns>The sum of the values returned by <c>FilterSelectedFeatures</c> for all documents.</returns>
public static Int32 FilterSpaceModelFeatures(this SpaceModel spaceModel, WeightDictionary selectedFeatures, ILogBuilder log)
{
    Int32 sinceLastReport = 0;
    Int32 reportInterval = spaceModel.documents.Count() / 5;
    Int32 filteredOutTotal = 0;

    List<String> featureKeys = selectedFeatures.GetKeys();
    List<String> termsToRemove = spaceModel.terms.GetTokensOtherThan(featureKeys);

    for (int docIndex = 0; docIndex < spaceModel.documents.Count; docIndex++)
    {
        var document = spaceModel.documents[docIndex];
        filteredOutTotal += document.FilterSelectedFeatures(termsToRemove, false);

        Boolean reportDue = sinceLastReport > reportInterval;
        if (reportDue)
        {
            Double progress = docIndex.GetRatio(spaceModel.documents.Count());
            String percent = progress.ToString("P2");
            log.log("Filter SelectedFeatures [" + percent + "]");
            sinceLastReport = 0;
        }

        sinceLastReport++;
    }

    // The per-label term dictionaries are filtered with the same removal list.
    spaceModel.terms_known_label.FilterTokens(termsToRemove, false);
    spaceModel.terms_unknown_label.FilterTokens(termsToRemove, false);

    return filteredOutTotal;
}
/// <summary>
/// Declares a label with the given name in the context, unless the context already
/// contains a label with exactly that name.
/// </summary>
/// <param name="text">The label name.</param>
/// <param name="context">The space model whose <c>labels</c> collection is checked and extended.</param>
public void AddLabel(String text, SpaceModel context)
{
    Boolean alreadyDeclared = context.labels.Any(existing => existing.name == text);
    if (alreadyDeclared)
    {
        return;
    }

    context.labels.Add(new SpaceLabel { name = text });
}
/// <summary>
/// Resolves label names to the label instances declared in the context.
/// </summary>
/// <remarks>
/// For each name the first label in <c>context.labels</c> with an equal <c>name</c> is taken;
/// names without a match are skipped. The result follows the order of <paramref name="labelIds"/>.
/// </remarks>
/// <param name="labelIds">The label names to look up.</param>
/// <param name="context">The space model whose <c>labels</c> are searched.</param>
/// <returns>The matching label instances; empty when nothing matches.</returns>
public List<SpaceLabel> GetLabels(IEnumerable<String> labelIds, SpaceModel context)
{
    IEnumerable<SpaceLabel> matches = labelIds
        .Select(labelName => context.labels.FirstOrDefault(candidate => candidate.name == labelName))
        .Where(match => match != null);

    return new List<SpaceLabel>(matches);
}
/// <summary>
/// Builds a new token dictionary by merging the model's labeled and/or unlabeled term dictionaries.
/// </summary>
/// <param name="model">The space model providing <c>terms_known_label</c> and <c>terms_unknown_label</c>.</param>
/// <param name="labeled">if set to <c>true</c>, <c>terms_known_label</c> is merged into the result.</param>
/// <param name="unlabeled">if set to <c>true</c>, <c>terms_unknown_label</c> is merged into the result.</param>
/// <returns>A new dictionary; it has nothing merged into it when both flags are <c>false</c>.</returns>
public static TokenDictionary GetTerms(this SpaceModel model, Boolean labeled, Boolean unlabeled)
{
    TokenDictionary merged = new TokenDictionary();

    // Labeled terms are merged first, then unlabeled ones.
    if (labeled)
    {
        merged.MergeDictionary(model.terms_known_label);
    }

    if (unlabeled)
    {
        merged.MergeDictionary(model.terms_unknown_label);
    }

    return merged;
}
/// <summary>
/// Links a label to a document model and to every document model nested below it.
/// </summary>
/// <remarks>
/// Each link is added to <c>space.LabelToDocumentLinks</c> with weight <c>1.0</c>.
/// Descendants are visited level by level: direct children first, then their children, and so on.
/// </remarks>
/// <param name="model">The root document model.</param>
/// <param name="label">The label to link.</param>
/// <param name="space">The space model that holds the label-to-document links.</param>
public static void SetLabel(this SpaceDocumentModel model, SpaceLabel label, SpaceModel space)
{
    space.LabelToDocumentLinks.Add(label, model, 1.0);

    // Worklist of descendants; it grows while being walked, which yields a level-order traversal.
    List<SpaceDocumentModel> descendants = model.Children.ToList();
    for (int index = 0; index < descendants.Count; index++)
    {
        SpaceDocumentModel current = descendants[index];
        descendants.AddRange(current.Children);
        space.LabelToDocumentLinks.Add(label, current, 1.0);
    }
}
/// <summary>
/// Collects the tokens of the model's labeled and/or unlabeled term dictionaries into one list.
/// </summary>
/// <param name="model">The space model providing <c>terms_known_label</c> and <c>terms_unknown_label</c>.</param>
/// <param name="labeled">if set to <c>true</c>, tokens of <c>terms_known_label</c> are included.</param>
/// <param name="unlabeled">if set to <c>true</c>, tokens of <c>terms_unknown_label</c> are included.</param>
/// <returns>A new list of tokens; empty when both flags are <c>false</c>.</returns>
public static List<String> GetTokens(this SpaceModel model, Boolean labeled, Boolean unlabeled)
{
    List<String> collected = new List<String>();

    // NOTE(review): the two-argument AddRange is a project extension; the 'true' flag
    // presumably suppresses duplicates - confirm against its definition.
    if (labeled)
    {
        collected.AddRange(model.terms_known_label.GetTokens(), true);
    }

    if (unlabeled)
    {
        collected.AddRange(model.terms_unknown_label.GetTokens(), true);
    }

    return collected;
}
/// <summary>
/// Creates a copy of this space model.
/// </summary>
/// <remarks>
/// The term dictionaries, documents, categories, topics and labels are cloned, and the
/// label-to-document and label-to-category links are then recreated between the cloned
/// instances. Cloned labels, documents and categories are matched to the originals by
/// <c>name</c>; <c>First</c> throws <see cref="InvalidOperationException"/> when no cloned
/// item carries the required name.
/// </remarks>
/// <returns>The cloned space model.</returns>
public SpaceModel Clone()
{
    SpaceModel copy = new SpaceModel
    {
        terms = this.terms.Clone(),
        terms_known_label = this.terms_known_label.Clone(),
        terms_unknown_label = this.terms_unknown_label.Clone(),
        documents = this.documents.Clone<SpaceDocumentModel>(false),
        categories = this.categories.Clone<SpaceCategoryModel>(false),
        topics = this.topics.CloneTerm<SpaceTopic>(),
        labels = this.labels.CloneTerm<SpaceLabel>()
    };

    // Rebuild label -> document links against the cloned labels and documents.
    foreach (var sourceLabel in labels)
    {
        var linkedDocuments = LabelToDocumentLinks.GetAllLinked(sourceLabel);
        var clonedLabel = copy.labels.First(candidate => candidate.name == sourceLabel.name);

        foreach (var link in linkedDocuments)
        {
            SpaceDocumentModel clonedDocument = copy.documents.First(candidate => candidate.name == link.name);
            copy.LabelToDocumentLinks.Add(clonedLabel, clonedDocument, link.weight);
        }
    }

    // Rebuild label -> category links against the cloned labels and categories.
    foreach (var sourceLabel in labels)
    {
        var linkedCategories = LabelToCategoryLinks.GetAllLinked(sourceLabel);
        var clonedLabel = copy.labels.First(candidate => candidate.name == sourceLabel.name);

        foreach (var link in linkedCategories)
        {
            SpaceCategoryModel clonedCategory = copy.categories.First(candidate => candidate.name == link.name);
            copy.LabelToCategoryLinks.Add(clonedLabel, clonedCategory, link.weight);
        }
    }

    return copy;
}
/// <summary>
/// Gets the documents of the model whose <c>labels</c> collection contains the given label name.
/// </summary>
/// <param name="model">The space model whose <c>documents</c> are searched.</param>
/// <param name="labelName">The label name to look for.</param>
/// <returns>A new list with the matching documents; empty when none match.</returns>
public static List<SpaceDocumentModel> GetDocumentsOfLabel(this SpaceModel model, String labelName)
{
    return model.documents
        .Where(document => document.labels.Contains(labelName))
        .ToList();
}
/// <summary>
/// Groups the documents of the model by label name.
/// </summary>
/// <remarks>
/// One entry is produced per label declared in <c>model.labels</c>; its value holds the
/// documents whose <c>labels</c> collection contains that label name. A label without
/// documents still gets an (empty) entry.
/// </remarks>
/// <param name="model">The space model.</param>
/// <param name="includeUnlabeled">
/// if set to <c>false</c>, the first occurrence of <c>SpaceLabel.UNKNOWN</c> is removed
/// from the label names before grouping.
/// </param>
/// <returns>Dictionary mapping each label name to its list of documents.</returns>
public static Dictionary<String, List<SpaceDocumentModel>> GetDocumentsByLabel(this SpaceModel model, Boolean includeUnlabeled = false)
{
    var labelNames = model.labels.Select(label => label.name).ToList();

    if (!includeUnlabeled)
    {
        labelNames.Remove(SpaceLabel.UNKNOWN);
    }

    return labelNames.ToDictionary(
        labelName => labelName,
        labelName => model.documents.Where(document => document.labels.Contains(labelName)).ToList());
}
/// <summary>
/// Constructs a document model from raw text: the text is tokenized, the tokens are stemmed,
/// and the stem frequencies become the document's term dictionary.
/// </summary>
/// <remarks>
/// The returned document is not added to <c>context.documents</c>, and its terms are not
/// merged into <c>context.terms</c>; only <c>context.terms_known_label</c> or
/// <c>context.terms_unknown_label</c> is updated. When <c>spaceSettings.DoMaintainWordIndex</c>
/// is set, <c>document.Words</c> is filled with the IDs that <c>context.terms.GetTokenID</c>
/// returns for each stemmed token at call time.
/// </remarks>
/// <param name="text">The text.</param>
/// <param name="name">The name assigned to the document.</param>
/// <param name="context">The space model whose labeled/unlabeled term dictionaries receive the stemmed tokens.</param>
/// <param name="stemmContext">The stemming context used to stem each token.</param>
/// <param name="tokenizer">The tokenizer.</param>
/// <param name="isKnownDocument">
/// if set to <c>true</c>, stemmed tokens are added to <c>context.terms_known_label</c>;
/// otherwise to <c>context.terms_unknown_label</c>.
/// </param>
/// <param name="metrics">Optional metrics; when given, token, unique-token and stemmed-token counts are added to it.</param>
/// <returns>The constructed document model.</returns>
public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
{
    var tokens = tokenizer.Tokenize(text);
    if (metrics != null)
    {
        metrics.TokensDoc += tokens.Length; // total token count
    }

    TokenDictionary tokenDictionary = new TokenDictionary(tokens);
    if (metrics != null)
    {
        metrics.UniqueTokensDoc += tokenDictionary.Count; // unique token count
    }

    // Fold the raw token frequencies into frequencies of their stems.
    TokenDictionary stemmDictionary = new TokenDictionary();
    List<String> uniqueTokens = tokenDictionary.GetTokens();
    foreach (String token in uniqueTokens)
    {
        String stem = stemmContext.Stem(token);
        stemmDictionary.CountToken(stem, tokenDictionary.GetTokenFrequency(token));
    }

    if (metrics != null)
    {
        metrics.StemmedTokensDoc += stemmDictionary.Count; // unique stem count
    }

    SpaceDocumentModel document = new SpaceDocumentModel();
    document.name = name;
    document.terms = stemmDictionary;
    document.Length = tokens.Length;

    // Read the setting once so allocation and filling of the word index always agree.
    var maintainWordIndex = spaceSettings.DoMaintainWordIndex;
    if (maintainWordIndex)
    {
        document.Words = new int[document.Length];
    }

    // The target dictionary depends only on isKnownDocument, so it is chosen once.
    var labelTerms = isKnownDocument ? context.terms_known_label : context.terms_unknown_label;

    for (int i = 0; i < tokens.Length; i++)
    {
        String stem = stemmContext.Stem(tokens[i]);
        labelTerms.AddToken(stem);

        if (maintainWordIndex)
        {
            document.Words[i] = context.terms.GetTokenID(stem);
        }
    }

    return document;
}