/// <summary>
/// Populates the space model of the given context. For every entry of
/// <c>context.renderSiteByDomain</c> a site-level document model is built; it contains one
/// page-level model per text document, which in turn contains one layer-level model per
/// render layer.
/// </summary>
/// <remarks>
/// Before processing, <c>context.stemmContext</c>, <c>context.tokenizer</c> and
/// <c>context.entityMetrics</c> are replaced with fresh values. Each site model is added to
/// <c>context.spaceModel.documents</c>, registered in
/// <c>context.spaceModel.LabelToDocumentLinks</c> for each of its labels, flattened and
/// finally has its labels propagated. Content metrics are collected per page only when
/// <c>DoKeepContentMetrics</c> is set.
/// </remarks>
/// <param name="context">The operation context that supplies the rendered documents and labels, and receives the resulting models.</param>
/// <param name="log">The log builder that receives progress messages.</param>
public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
{
    log.log("Space model population");

    context.stemmContext = new StemmingContext(stemmer);
    context.tokenizer = tokenizer;
    context.entityMetrics = new Dictionary<String, ContentMetrics>();

    foreach (KeyValuePair<String, TextDocumentSet> pair in context.renderSiteByDomain)
    {
        SpaceLabel spaceLabel = context.spaceLabelsDomains[pair.Key];

        SpaceDocumentModel modelOfSite = new SpaceDocumentModel();
        modelOfSite.name = pair.Key;
        modelOfSite.labels.Add(spaceLabel.name);

        foreach (TextDocumentLayerCollection textLayer in pair.Value)
        {
            SpaceDocumentModel modelOfPage = new SpaceDocumentModel(textLayer.name);

            // Metrics stay null when they are not requested; the constructor accepts null.
            ContentMetrics metrics = null;
            if (DoKeepContentMetrics)
            {
                metrics = new ContentMetrics(textLayer.name);
            }

            foreach (var renderLayer in textLayer)
            {
                // The constructor call produces the layer model directly; no placeholder
                // instance is allocated beforehand.
                SpaceDocumentModel modelOfLayer = spaceConstructor.ConstructDocument(
                    renderLayer.content,
                    modelOfPage.name + renderLayer.name,
                    context.spaceModel,
                    context.stemmContext,
                    tokenizer,
                    spaceLabel.name != SpaceLabel.UNKNOWN,
                    metrics);

                modelOfLayer.weight = renderLayer.layerWeight;
                modelOfLayer.documentScope = DocumentBlenderFunctionOptions.layerLevel;

                modelOfPage.Children.Add(modelOfLayer);
            }

            modelOfPage.documentScope = DocumentBlenderFunctionOptions.pageLevel;

            // Keyed on the metrics instance itself rather than re-reading the flag, so the
            // dereference below can never hit a null reference.
            if (metrics != null)
            {
                context.entityMetrics.Add(metrics.Name, metrics);
            }

            modelOfSite.Children.Add(modelOfPage);
        }

        modelOfSite.documentScope = DocumentBlenderFunctionOptions.siteLevel;

        context.spaceModel.documents.Add(modelOfSite);

        foreach (String label in modelOfSite.labels)
        {
            SpaceLabel sLabel = context.spaceLabels[label];
            context.spaceModel.LabelToDocumentLinks.Add(sLabel, modelOfSite, 1);
        }

        // Flatten first, then propagate labels: same order as the original implementation.
        modelOfSite.Flatten(false);
        modelOfSite.PropagateLabels();
    }

    log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
}
/// <summary>
/// Constructs a document model from raw text: tokenizes the text, builds a dictionary of
/// stemmed terms with their frequencies, and adds every stemmed token to the known- or
/// unknown-label term dictionary of the space model.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <param name="name">The name assigned to the created document model.</param>
/// <param name="context">The space model whose label term dictionaries are updated and whose <c>terms</c> supply token IDs for the word index.</param>
/// <param name="stemmContext">The stemming context used to stem the tokens.</param>
/// <param name="tokenizer">The tokenizer that splits <paramref name="text"/> into tokens.</param>
/// <param name="isKnownDocument">If <c>true</c>, stemmed tokens are added to <c>context.terms_known_label</c>; otherwise to <c>context.terms_unknown_label</c>.</param>
/// <param name="metrics">Optional metrics; when not <c>null</c>, the token, unique-token and stemmed-token counts of this document are added to it.</param>
/// <returns>The created document model with its name, stemmed terms and length set, plus the word index when <c>spaceSettings.DoMaintainWordIndex</c> is enabled.</returns>
public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
{
    var tokens = tokenizer.Tokenize(text);

    if (metrics != null)
    {
        metrics.TokensDoc += tokens.Length; // total number of tokens
    }

    TokenDictionary tokenDictionary = new TokenDictionary(tokens);

    if (metrics != null)
    {
        metrics.UniqueTokensDoc += tokenDictionary.Count; // number of distinct tokens
    }

    // Build the stemmed dictionary: each distinct token contributes its frequency to its stem.
    TokenDictionary stemmDictionary = new TokenDictionary();
    List<String> uniqueTokens = tokenDictionary.GetTokens();
    for (int i = 0; i < uniqueTokens.Count; i++)
    {
        String stem = stemmContext.Stem(uniqueTokens[i]);
        stemmDictionary.CountToken(stem, tokenDictionary.GetTokenFrequency(uniqueTokens[i]));
    }

    if (metrics != null)
    {
        metrics.StemmedTokensDoc += stemmDictionary.Count; // number of distinct stems
    }

    SpaceDocumentModel document = new SpaceDocumentModel();
    document.name = name;
    document.terms = stemmDictionary;
    document.Length = tokens.Length;

    // Read the setting once so the allocation and the per-token writes cannot disagree.
    bool maintainWordIndex = spaceSettings.DoMaintainWordIndex;
    if (maintainWordIndex)
    {
        document.Words = new int[document.Length];
    }

    // Second pass over all tokens, in document order: update the label-specific term
    // dictionary and, if requested, record the token ID at the token's position.
    for (int i = 0; i < tokens.Length; i++)
    {
        String stem = stemmContext.Stem(tokens[i]);

        if (isKnownDocument)
        {
            context.terms_known_label.AddToken(stem);
        }
        else
        {
            context.terms_unknown_label.AddToken(stem);
        }

        if (maintainWordIndex)
        {
            document.Words[i] = context.terms.GetTokenID(stem);
        }
    }

    return document;
}