コード例 #1
0
        /// <summary>
        /// Produces a <c>TextDocumentSet</c> for every web site in the context dataset - taken from the cache when
        /// caching is enabled and an entry exists, rendered otherwise - and registers it in the context indexes.
        /// </summary>
        /// <param name="context">
        /// Operation context: its <c>dataset</c> is iterated, and its <c>renderSiteByDomain</c> and
        /// <c>renderLayersByAssignedID</c> indexes are filled.
        /// </param>
        /// <param name="log">The log; also handed to the renderer.</param>
        /// <param name="EnableRendering">Passed unchanged to <c>render.RenderSiteDocuments</c>.</param>
        public void TextRendering(OperationContext context, ILogBuilder log, Boolean EnableRendering = true)
        {
            log.log("Text rendering");

            foreach (KeyValuePair<string, WebSiteDocumentsSet> pair in context.dataset)
            {
                foreach (WebSiteDocuments site in pair.Value)
                {
                    TextDocumentSet tds = null;

                    if (DoUseCache && CacheProvider.IsReady)
                    {
                        tds = CacheProvider.GetCached<TextDocumentSet>(setupSignature, context.dataSetSignature, site.domain);
                    }

                    // Remember where the set came from: only freshly rendered sets need to be stored below.
                    Boolean cacheHit = tds != null;

                    if (!cacheHit)
                    {
                        tds = render.RenderSiteDocuments(site, log, EnableRendering);
                    }

                    // The site domain is the key used by the indexes and by the cache.
                    tds.name = site.domain;

                    // Dictionary.Add throws if the same domain / document name is registered twice.
                    context.renderSiteByDomain.Add(tds.name, tds);
                    foreach (var td in tds)
                    {
                        context.renderLayersByAssignedID.Add(td.name, td);
                    }

                    // Writing back an entry that was just read from the cache is redundant work,
                    // so the cache is updated only after an actual render.
                    if (!cacheHit && DoUseCache && CacheProvider.IsReady)
                    {
                        CacheProvider.SetCached(setupSignature, context.dataSetSignature, tds.name, tds);
                    }
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// Merges the content units of every layer collection in <paramref name="entityTexts"/> into one text document.
        /// </summary>
        /// <param name="entityTexts">The set of texts to merge; its name is copied to the result.</param>
        /// <returns>A new text document built from the joined content units, named after <paramref name="entityTexts"/>.</returns>
        public TextDocument blendToTextDocument(TextDocumentSet entityTexts)
        {
            List<String> collected = new List<String>();

            // Gather the content units of each member of the set, in iteration order.
            foreach (TextDocumentLayerCollection layers in entityTexts)
            {
                collected.AddRange(breakToContentUnits(layers, options));
            }

            // FilterUnits is applied only when the uniqueContentUnitsOnly flag is set.
            Boolean uniqueOnly = options.HasFlag(DocumentBlenderFunctionOptions.uniqueContentUnitsOnly);
            List<String> selected = uniqueOnly ? FilterUnits(collected) : collected;

            return new TextDocument(JoinUnits(selected))
            {
                name = entityTexts.name
            };
        }
コード例 #3
0
        /// <summary>
        /// Executes the plane method: creates a label for every document set of the input dataset, renders each web
        /// site into a text document set (through <c>CacheProvider</c>), optionally filters the rendered pages and
        /// blends them into the corpus documents of a new corpus plane context.
        /// </summary>
        /// <param name="inputContext">The input context - related to this plane. Must be an <c>IEntityPlaneContext</c>.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" />. Not read by this method.</param>
        /// <param name="logger">The logger, handed to the renderer.</param>
        /// <returns>
        /// A new <c>CorpusPlaneContext</c> whose <c>dataset</c>, space labels and <c>corpus_documents</c> were populated
        /// from the input context.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="inputContext"/> is <c>null</c> or does not implement <c>IEntityPlaneContext</c>.
        /// </exception>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            // Fail with a clear message instead of a NullReferenceException further down when the
            // caller supplies the wrong kind of context.
            IEntityPlaneContext context = inputContext as IEntityPlaneContext;
            if (context == null)
            {
                throw new ArgumentException("Input context must be a non-null IEntityPlaneContext.", "inputContext");
            }

            if (notes != null)
            {
                notes.logStartPhase("[1] Entity Plane - execution", "");
            }

            CorpusPlaneContext outputContext = new CorpusPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            outputContext.dataset = context.dataset;

            // ---------------- rendering procedure
            Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>> renderIndex = new Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>>();

            // Kept so that two document sets resolving to the same label name still fail on Add, as before.
            Dictionary<string, SpaceLabel> labels = new Dictionary<string, SpaceLabel>();

            Dictionary<WebSiteDocuments, TextDocumentSet> sitesToRenders = new Dictionary<WebSiteDocuments, TextDocumentSet>();
            Dictionary<String, WebSiteDocuments> inputSites = new Dictionary<string, WebSiteDocuments>();
            Dictionary<String, TextDocumentSet> inputTextRenders = new Dictionary<string, TextDocumentSet>();
            Dictionary<WebSiteDocuments, List<SpaceLabel>> inputSiteVsLabels = new Dictionary<WebSiteDocuments, List<SpaceLabel>>();

            Int32 entityCount = 0;

            // rendering
            foreach (WebSiteDocumentsSet docSet in context.dataset)
            {
                // The label created for this document set is kept in a local: for an unnamed set the label is
                // registered under SpaceLabel.UNKNOWN, so looking it up again by docSet.name (null or empty)
                // would throw.
                SpaceLabel docSetLabel;

                if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
                {
                    outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
                    docSetLabel = outputContext.space.label_unknown;
                    labels.Add(SpaceLabel.UNKNOWN, docSetLabel);
                }
                else
                {
                    docSetLabel = new SpaceLabel(docSet.name);
                    labels.Add(docSetLabel.name, docSetLabel);
                    outputContext.space.labels.Add(docSetLabel);
                }

                // NOTE(review): looks loop-invariant; could be hoisted if GetDataSetSignature has no side effects - confirm.
                String datasetSignature = context.dataset.GetDataSetSignature();

                // ---- render
                List<TextDocumentSet> textSetForLabel = new List<TextDocumentSet>();

                if (CacheProvider.IsReady)
                {
                    // Per-site path: reuse the cached render when there is one, otherwise render and cache it.
                    foreach (WebSiteDocuments site in docSet)
                    {
                        TextDocumentSet tds = CacheProvider.GetCached<TextDocumentSet>(setupSignature, datasetSignature, site.domain);

                        if (tds == null)
                        {
                            tds = render.RenderSiteDocuments(site, logger);
                            CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                        }
                        else
                        {
                            tds.name = site.domain;
                        }

                        textSetForLabel.Add(tds);
                    }
                }
                else
                {
                    textSetForLabel = render.RenderDocumentSet(docSet, logger);

                    // NOTE(review): SetCached is called although the provider reported it is not ready;
                    // presumably the provider ignores or buffers the call - confirm.
                    foreach (TextDocumentSet ws in textSetForLabel)
                    {
                        CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
                    }
                }

                // Index the renders by their name; the "tmp index" below pairs them with sites by domain,
                // so a render's name has to equal the domain of its site.
                textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));

                // --- rest of indexing
                docSet.ForEach(x => inputSites.Add(x.domain, x));
                renderIndex.Add(docSet, textSetForLabel);

                foreach (WebSiteDocuments site in docSet)
                {
                    inputSiteVsLabels.Add(site, new List<SpaceLabel> { docSetLabel });
                    entityCount++;
                }
            }

            if (notes != null)
            {
                notes.log("Text document for [" + entityCount + "] entities created");
            }

            // tmp index
            foreach (String key in inputSites.Keys)
            {
                sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
            }

            // page in site filtering
            if (filter.IsEnabled)
            {
                Dictionary<WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary<WebSiteDocuments, TextDocumentSet>();

                // The filter learns from all renders before any single set is filtered.
                filter.Learn(inputTextRenders.Values);

                foreach (KeyValuePair<WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
                {
                    renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
                }
                sitesToRenders = renderIndexFiltered;
            }

            Dictionary<String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary<string, TextDocumentSet>();

            foreach (var pair in sitesToRenders)
            {
                TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
            }

            // blending pages into corpus documents
            // NOTE(review): when keepSeparated is true the pages are blended into ONE document
            // (blendToTextDocument), and when it is false they go through blendToSeparateTextDocuments.
            // This reads inverted relative to the flag name - confirm the meaning of
            // blender.DoKeepPagesSeparated before changing either branch.
            Boolean keepSeparated = blender.DoKeepPagesSeparated;

            foreach (var pair in renderIndex)
            {
                foreach (TextDocumentSet entitySet in pair.Value)
                {
                    // Use the (possibly filtered) texts of the site rather than the raw render.
                    TextDocumentSet selectedTexts = TextDocumentsByDomainName[entitySet.name];
                    WebSiteDocuments web = inputSites[entitySet.name];

                    // Materialized once, because it may be added to several documents below.
                    List<string> labelNames = inputSiteVsLabels[web].Select(x => x.name).ToList();

                    if (keepSeparated)
                    {
                        TextDocument doc = blender.blendToTextDocument(selectedTexts);
                        doc.labels.AddRange(labelNames);
                        outputContext.corpus_documents.Add(doc);
                    }
                    else
                    {
                        var docs = blender.blendToSeparateTextDocuments(selectedTexts);
                        foreach (TextDocument doc in docs)
                        {
                            doc.labels.AddRange(labelNames);
                            outputContext.corpus_documents.Add(doc);
                        }
                    }
                }
            }

            if (notes != null)
            {
                notes.logEndPhase();
            }

            return outputContext;
        }