/// <summary>
        /// Renders every document of a web site and collects the results into a single document set.
        /// </summary>
        /// <param name="site">The site whose <c>documents</c> are rendered; its <c>domain</c> is passed to the resulting set.</param>
        /// <param name="logger">The logger. Not used by this method.</param>
        /// <param name="EnableRendering">Forwarded unchanged to <c>RenderText</c> for each page.</param>
        /// <returns>Document set holding one layer collection per page, each named after the page's <c>AssignedID</c>.</returns>
        public TextDocumentSet RenderSiteDocuments(WebSiteDocuments site, ILogBuilder logger, Boolean EnableRendering = true)
        {
            var renderedSet = new TextDocumentSet(site.domain);

            foreach (WebSiteDocument page in site.documents)
            {
                TextDocumentLayerCollection pageLayers = RenderText(page, site, EnableRendering);

                // Name the rendered collection after its source page before storing it.
                pageLayers.name = page.AssignedID;
                renderedSet.Add(pageLayers);
            }

            return renderedSet;
        }
        /// <summary>
        /// Splits the rendered layers into content units, at the granularity selected by <paramref name="options"/>.
        /// </summary>
        /// <remarks>
        /// Only <c>pageLevel</c> is supported: the whole collection is converted with <c>ToString()</c> and returned
        /// as a single unit. If none of the level flags is set, an empty list is returned.
        /// </remarks>
        /// <param name="layers">The layers to break into units.</param>
        /// <param name="options">Flags selecting the unit granularity; <c>pageLevel</c> is checked first.</param>
        /// <returns>List of content units; a single entry for <c>pageLevel</c>, empty when no level flag is set.</returns>
        /// <exception cref="NotImplementedException">
        /// <paramref name="options"/> requests <c>blockLevel</c> or <c>sentenceLevel</c> units (without <c>pageLevel</c>).
        /// </exception>
        public List<String> breakToContentUnits(TextDocumentLayerCollection layers, DocumentBlenderFunctionOptions options)
        {
            List<String> units = new List<String>();

            if (options.HasFlag(DocumentBlenderFunctionOptions.pageLevel))
            {
                units.Add(layers.ToString());
                return units;
            }

            if (options.HasFlag(DocumentBlenderFunctionOptions.blockLevel))
            {
                throw new NotImplementedException("Breaking content into block-level units is not implemented.");
            }

            if (options.HasFlag(DocumentBlenderFunctionOptions.sentenceLevel))
            {
                throw new NotImplementedException("Breaking content into sentence-level units is not implemented.");
            }

            return units;
        }
Example #3
0
 /// <summary>
 /// Returns the <c>TotalScore</c> of the statistics entry stored for the given document.
 /// </summary>
 /// <param name="document">The document; its <c>name</c> is the second-level key into <c>stats</c>.</param>
 /// <param name="documentSet">First-level key into <c>stats</c>.</param>
 /// <returns>The <c>TotalScore</c> value of the matching entry.</returns>
 public override double Compute(TextDocumentLayerCollection document, String documentSet)
 {
     var setStats      = stats[documentSet];
     var documentStats = setStats[document.name];

     return documentStats.TotalScore;
 }
 /// <summary>
 /// Returns the <c>entropyFreq</c> of the statistics entry stored for the given document.
 /// </summary>
 /// <param name="document">The document; its <c>name</c> is the second-level key into <c>stats</c>.</param>
 /// <param name="documentSet">First-level key into <c>stats</c>.</param>
 /// <returns>The <c>entropyFreq</c> value of the matching entry.</returns>
 public override double Compute(TextDocumentLayerCollection document, String documentSet)
 {
     var setStats      = stats[documentSet];
     var documentStats = setStats[document.name];

     return documentStats.entropyFreq;
 }
 /// <summary>
 /// Not implemented in this override; every call throws.
 /// </summary>
 /// <param name="document">The document (unused).</param>
 /// <param name="parentID">The parent identifier (unused).</param>
 /// <returns>Never returns.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public override double Compute(TextDocumentLayerCollection document, string parentID)
 {
     throw new NotImplementedException();
 }
        /// <summary>
        /// Renders a single web page into a layer collection, creating one text layer per render instruction.
        /// </summary>
        /// <remarks>
        /// When <paramref name="EnableRendering"/> is <c>false</c> nothing is rendered and every layer is created
        /// with an empty string as its content.
        /// </remarks>
        /// <param name="webPage">The web page to render; its <c>AssignedID</c> becomes the name of the returned collection.</param>
        /// <param name="site">The site the page belongs to; its link graph and documents are used by link-selecting instructions.</param>
        /// <param name="EnableRendering">if set to <c>true</c> the instructions are rendered; otherwise the layers are left empty.</param>
        /// <param name="htmlRelDoc">The HTML document of the page; when <c>null</c> it is obtained from <c>HtmlDocumentCache.DefaultDocumentCache</c>.</param>
        /// <returns>Layer collection with one layer per instruction, named and weighted after that instruction.</returns>
        /// <exception cref="nlpException">An instruction selects links but the site has no web graph.</exception>
        public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
        {
            TextDocumentLayerCollection output = new TextDocumentLayerCollection();

            output.name = webPage.AssignedID;

            if (htmlRelDoc == null)
            {
                htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
            }

            foreach (DocumentRenderInstruction instruction in instructions)
            {
                String content = "";

                if (EnableRendering)
                {
                    DocumentRenderInstructionFlags flags = instruction.instructionFlags;
                    StringBuilder sb = new StringBuilder();

                    if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
                    {
                        RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
                    {
                        if (site.extensions.graph == null)
                        {
                            throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                        }

                        imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection =
                            site.extensions.graph.GetLinks(webPage.AssignedID,
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                        // The flag does not change per link, so it is tested once instead of on every iteration.
                        if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                        {
                            foreach (var link in selection.links)
                            {
                                // Labels are appended unconditionally (an empty label still yields a line break),
                                // exactly as before; the former empty if/else around this call had no effect.
                                sb.AppendLine(link.linkLabel);
                            }
                        }

                        if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                        {
                            foreach (var node in selection.linkedNodeClones.Values)
                            {
                                WebSiteDocument doc = site.documents.FirstOrDefault(x => x.AssignedID == node.name);

                                // A graph node without a matching site document has nothing to render;
                                // skip it instead of handing a null document to the renderer.
                                if (doc == null)
                                {
                                    continue;
                                }

                                RenderDocumentAspect(sb, doc, null, flags, instruction.code);
                            }
                        }
                    }

                    content = sb.ToString();

                    if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
                    {
                        content = content.ToLower();
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
                    {
                        List<string> tokens = content.getTokens(true, false, true, true, 1);

                        // Keep the first occurrence of each token, in original order; the set gives
                        // constant-time membership checks instead of scanning a list for every token.
                        HashSet<string> seen = new HashSet<string>();
                        StringBuilder uniqueTokens = new StringBuilder();

                        foreach (String token in tokens)
                        {
                            if (seen.Add(token))
                            {
                                uniqueTokens.Append(token).Append(' ');
                            }
                        }

                        content = uniqueTokens.ToString();
                    }

                    content = TrimEmptySpace(content);
                }

                output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
            }

            return output;
        }
Example #7
0
 /// <summary>
 /// Computes a numeric value for the given document; the calculation is supplied by derived classes.
 /// </summary>
 /// <param name="document">The document to compute the value for.</param>
 /// <param name="parentID">Identifier of the document's parent. NOTE(review): presumably the key of the set/site the document belongs to — confirm against implementations.</param>
 /// <returns>The computed value.</returns>
 public abstract Double Compute(TextDocumentLayerCollection document, String parentID);