/// <summary>
        /// Builds the document hierarchy (site -> page -> layer) of the space model from the rendered
        /// text in <c>context.renderSiteByDomain</c> and registers every site model in
        /// <c>context.spaceModel</c>, linked to its label.
        /// </summary>
        /// <remarks>
        /// Side effects on <paramref name="context"/>: <c>stemmContext</c>, <c>tokenizer</c> and
        /// <c>entityMetrics</c> are replaced; <c>spaceModel.documents</c> and
        /// <c>spaceModel.LabelToDocumentLinks</c> receive one entry per site.
        /// </remarks>
        /// <param name="context">Operation context that supplies the rendered sites and labels and receives the resulting models.</param>
        /// <param name="log">Log builder used for progress messages.</param>
        public void SpaceModelPopulation(OperationContext context, ILogBuilder log)
        {
            log.log("Space model population");
            context.stemmContext = new StemmingContext(stemmer);
            context.tokenizer    = tokenizer;

            context.entityMetrics = new Dictionary<String, ContentMetrics>();

            foreach (KeyValuePair<String, TextDocumentSet> pair in context.renderSiteByDomain)
            {
                // NOTE(review): assumes every domain has an entry in spaceLabelsDomains - confirm against the caller.
                SpaceLabel spaceLabel = context.spaceLabelsDomains[pair.Key];

                // The label is the same for every layer of the site, so the check is evaluated once per site.
                Boolean isKnownSite = spaceLabel.name != SpaceLabel.UNKNOWN;

                SpaceDocumentModel modelOfSite = new SpaceDocumentModel();
                modelOfSite.name = pair.Key;
                modelOfSite.labels.Add(spaceLabel.name);

                foreach (TextDocumentLayerCollection textLayer in pair.Value)
                {
                    SpaceDocumentModel modelOfPage = new SpaceDocumentModel(textLayer.name);

                    // Metrics are collected per page, only when the feature is switched on.
                    ContentMetrics metrics = null;
                    if (DoKeepContentMetrics)
                    {
                        metrics = new ContentMetrics(textLayer.name);
                    }

                    foreach (var renderLayer in textLayer)
                    {
                        String layerName = modelOfPage.name + renderLayer.name;

                        // ConstructDocument creates the layer model itself; no placeholder instance is needed.
                        SpaceDocumentModel modelOfLayer = spaceConstructor.ConstructDocument(renderLayer.content, layerName,
                                                                                             context.spaceModel, context.stemmContext, tokenizer,
                                                                                             isKnownSite, metrics);

                        modelOfLayer.weight        = renderLayer.layerWeight;
                        modelOfLayer.documentScope = DocumentBlenderFunctionOptions.layerLevel;

                        modelOfPage.Children.Add(modelOfLayer);
                    }

                    modelOfPage.documentScope = DocumentBlenderFunctionOptions.pageLevel;

                    // Guard on the instance rather than re-reading the flag, so the dereference below is always safe.
                    if (metrics != null)
                    {
                        // Dictionary.Add throws when two pages share the same metrics name.
                        context.entityMetrics.Add(metrics.Name, metrics);
                    }

                    modelOfSite.Children.Add(modelOfPage);
                }

                modelOfSite.documentScope = DocumentBlenderFunctionOptions.siteLevel;

                context.spaceModel.documents.Add(modelOfSite);

                foreach (String label in modelOfSite.labels)
                {
                    SpaceLabel sLabel = context.spaceLabels[label];
                    context.spaceModel.LabelToDocumentLinks.Add(sLabel, modelOfSite, 1);
                }

                // Order matters here: the site is flattened before its labels are propagated.
                modelOfSite.Flatten(false);

                modelOfSite.PropagateLabels();
            }

            log.log("Space model -- documents created [" + context.spaceModel.documents.Count + "]");
        }
Exemplo n.º 2
0
        /// <summary>
        /// Constructs a document model from raw text: the text is tokenized, a dictionary of stemmed
        /// terms with frequencies is built, and every stemmed token occurrence is added to the
        /// known-label or unknown-label term dictionary of <paramref name="context"/>.
        /// </summary>
        /// <param name="text">The text to tokenize.</param>
        /// <param name="name">The name assigned to the resulting document model.</param>
        /// <param name="context">The space model whose term dictionaries are updated.</param>
        /// <param name="stemmContext">The stemming context used to stem each token.</param>
        /// <param name="tokenizer">The tokenizer applied to <paramref name="text"/>.</param>
        /// <param name="isKnownDocument">
        /// When <c>true</c> the stemmed tokens are added to <c>context.terms_known_label</c>,
        /// otherwise to <c>context.terms_unknown_label</c>.
        /// </param>
        /// <param name="metrics">Optional metrics accumulator; when not <c>null</c> its token counters are incremented.</param>
        /// <returns>The new document model carrying the name, the stemmed term dictionary and the token count.</returns>
        public SpaceDocumentModel ConstructDocument(string text, String name, SpaceModel context, StemmingContext stemmContext, ITokenizer tokenizer, Boolean isKnownDocument, ContentMetrics metrics = null)
        {
            var tokens = tokenizer.Tokenize(text);

            // Distinct raw tokens with their frequencies.
            TokenDictionary tokenDictionary = new TokenDictionary(tokens);

            // Fold the raw frequencies onto the stemmed forms.
            TokenDictionary stemmDictionary = new TokenDictionary();
            foreach (String token in tokenDictionary.GetTokens())
            {
                stemmDictionary.CountToken(stemmContext.Stem(token), tokenDictionary.GetTokenFrequency(token));
            }

            if (metrics != null)
            {
                metrics.TokensDoc        += tokens.Length;           // all tokens
                metrics.UniqueTokensDoc  += tokenDictionary.Count;   // distinct tokens
                metrics.StemmedTokensDoc += stemmDictionary.Count;   // distinct stems
            }

            SpaceDocumentModel document = new SpaceDocumentModel();

            document.name   = name;
            document.terms  = stemmDictionary;
            document.Length = tokens.Length;

            // Read the setting once so that allocation and the per-token writes cannot disagree.
            Boolean maintainWordIndex = spaceSettings.DoMaintainWordIndex;

            if (maintainWordIndex)
            {
                document.Words = new int[document.Length];
            }

            for (int i = 0; i < tokens.Length; i++)
            {
                String stk = stemmContext.Stem(tokens[i]);

                if (isKnownDocument)
                {
                    context.terms_known_label.AddToken(stk);
                }
                else
                {
                    context.terms_unknown_label.AddToken(stk);
                }

                if (maintainWordIndex)
                {
                    // NOTE(review): the ID is looked up in context.terms, which this method never fills - confirm it is populated elsewhere.
                    document.Words[i] = context.terms.GetTokenID(stk);
                }
            }

            return document;
        }