Beispiel #1
0
        /// <summary>
        /// Registers a rendering instruction that tells the HTML to text extraction engine (EntityPlaneMethod) what to render into text.
        /// </summary>
        /// <param name="name">Instruction name: a human-readable descriptive name or a special instruction name like ::BODYTEXT::</param>
        /// <param name="flags">Instruction flags, controlling what is rendered and how.</param>
        /// <param name="code">XPath associated with the instruction; selects the nodes to be rendered into text.</param>
        /// <param name="weight">Weight factor of the instruction, i.e. number of times the content should be repeated (boosting TF).</param>
        /// <param name="expansion">Graph selection expansion steps; stored on the instruction as <c>graphExpansionSteps</c>.</param>
        /// <param name="remove">If set to <c>true</c>, all existing instructions are cleared before the new one is added.</param>
        /// <remarks>
        /// The new instruction is always appended to <c>data.instructions</c>; clearing the set first is optional.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_setRenderInstruction(
            [Description("Instruction name, it is human-readable descriptive name or special instructin name like ::BODYTEXT::")] String name = "::BODYTEXT::",
            [Description("Instruction flags, controls what and how to render")] DocumentRenderInstructionFlags flags = DocumentRenderInstructionFlags.this_page_content,
            [Description("XPath associated with the instruction, selects nodes to be rendered into text")] String code = "",
            [Description("Weight factor of the instruction, i.e. number of times the content should be repeated (boosting TF)")] Double weight = 1.0,
            [Description("Graph selection expansion steps - to reach 1+ edges far nodes ")] Int32 expansion = 1,
            [Description("If true it will remove any existing instruction in the set")] Boolean remove = false)
        {
            // Optionally start from an empty instruction set.
            if (remove)
            {
                data.instructions.Clear();
            }

            // Members are assigned in the same order as before: flags, expansion steps, weight.
            var instruction = new DocumentRenderInstruction(name, code, weight)
            {
                instructionFlags = flags,
                graphExpansionSteps = expansion,
                weight = weight
            };

            data.instructions.Add(instruction);
        }
        /// <summary>
        /// Renders the aspects of a document selected by <paramref name="flags"/> and appends the text to <paramref name="sb"/>.
        /// </summary>
        /// <param name="sb">Builder that receives the rendered text.</param>
        /// <param name="doc">The document being rendered; when <c>null</c> nothing is rendered.</param>
        /// <param name="htmlRelDoc">
        /// Parsed HTML of the document. When <c>null</c>, the HTML-based aspects (title, description, content, XPath)
        /// are skipped instead of failing; URL tokens are still rendered.
        /// </param>
        /// <param name="flags">Flags selecting which aspects are rendered.</param>
        /// <param name="code">XPath handed to the body text renderer and used for the <c>page_xpath</c> aspect.</param>
        /// <remarks>
        /// Aspects are appended in a fixed order: title, description, content, URL tokens, XPath selection.
        /// </remarks>
        public void RenderDocumentAspect(StringBuilder sb, WebSiteDocument doc, HtmlDocument htmlRelDoc, DocumentRenderInstructionFlags flags, String code = "")
        {
            if (doc == null)
            {
                return;
            }

            // Callers may not have parsed HTML at hand; guard every HTML-based aspect against a null document.
            Boolean hasHtml = htmlRelDoc != null;

            if (hasHtml && flags.HasFlag(DocumentRenderInstructionFlags.page_title))
            {
                HtmlNode titleNode = htmlRelDoc.DocumentNode.Descendants("title").FirstOrDefault();
                if (titleNode != null)
                {
                    sb.AppendLine(titleNode.InnerText);
                }
            }

            if (hasHtml && flags.HasFlag(DocumentRenderInstructionFlags.page_description))
            {
                List<String> descriptions = new List<String>();

                foreach (HtmlNode metaNode in htmlRelDoc.DocumentNode.Descendants("meta"))
                {
                    String metaName = metaNode.GetAttributeValue("name", "");

                    // Attribute names are identifiers, not linguistic text: compare ordinally so the match
                    // does not depend on the current culture's casing rules.
                    if (metaName.Equals("description", StringComparison.OrdinalIgnoreCase))
                    {
                        String metaContent = metaNode.GetAttributeValue("content", "");
                        if (!metaContent.isNullOrEmpty())
                        {
                            descriptions.Add(metaContent);
                        }
                    }
                }

                if (descriptions.Count > 0)
                {
                    // Separate multiple descriptions with a space so adjacent words do not fuse into one token.
                    sb.AppendLine(String.Join(" ", descriptions.ToArray()));
                }
            }

            if (hasHtml && flags.HasFlag(DocumentRenderInstructionFlags.page_content))
            {
                sb.AppendLine(BodyTextRender(htmlRelDoc, code));
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.url_tokens))
            {
                List<string> urlTokens = doc.path.getTokens(true, false, true, true, 1);

                foreach (String token in urlTokens)
                {
                    sb.Append(token).Append(' ');
                }
            }

            if (hasHtml && flags.HasFlag(DocumentRenderInstructionFlags.page_xpath))
            {
                // Without an XPath there is nothing to select.
                if (!code.isNullOrEmpty())
                {
                    sb.AppendLine(Render(htmlRelDoc.DocumentNode, code));
                }
            }
        }
        /// <summary>
        /// Renders a web page into text layers, creating one layer per rendering instruction.
        /// </summary>
        /// <param name="webPage">The web page to render; its <c>AssignedID</c> becomes the name of the returned collection.</param>
        /// <param name="site">The site the page belongs to; its link graph and documents are used by link-selecting instructions.</param>
        /// <param name="EnableRendering">If set to <c>false</c>, a layer is still created for every instruction but its content is empty.</param>
        /// <param name="htmlRelDoc">Parsed HTML of <paramref name="webPage"/>; when <c>null</c> it is obtained from <c>HtmlDocumentCache.DefaultDocumentCache</c>.</param>
        /// <returns>Collection holding one layer per instruction (instruction name, rendered content, weight converted to an integer).</returns>
        /// <exception cref="nlpException">An instruction selects links but the site's web graph is <c>null</c>.</exception>
        public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
        {
            TextDocumentLayerCollection output = new TextDocumentLayerCollection();

            output.name = webPage.AssignedID;

            if (htmlRelDoc == null)
            {
                htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
            }

            foreach (DocumentRenderInstruction instruction in instructions)
            {
                String content = "";

                if (EnableRendering)
                {
                    StringBuilder sb = new StringBuilder();
                    DocumentRenderInstructionFlags flags = instruction.instructionFlags;

                    if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
                    {
                        RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
                    {
                        if (site.extensions.graph == null)
                        {
                            throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                        }

                        imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection =
                            site.extensions.graph.GetLinks(webPage.AssignedID,
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                        if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                        {
                            foreach (var link in selection.links)
                            {
                                // Links without a caption contribute no text; do not emit blank lines for them.
                                if (!link.linkLabel.isNullOrEmpty())
                                {
                                    sb.AppendLine(link.linkLabel);
                                }
                            }
                        }

                        if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                        {
                            foreach (var node in selection.linkedNodeClones.Values)
                            {
                                WebSiteDocument relatedPage = site.documents.FirstOrDefault(x => x.AssignedID == node.name);
                                if (relatedPage == null)
                                {
                                    // The graph node has no matching document in this site; nothing to render.
                                    continue;
                                }

                                // The related page needs its own parsed HTML: passing null here made every
                                // HTML-based aspect dereference a null document.
                                HtmlDocument relatedHtml = HtmlDocumentCache.DefaultDocumentCache.GetDocument(relatedPage.AssignedID, relatedPage.HTMLSource);
                                RenderDocumentAspect(sb, relatedPage, relatedHtml, flags, instruction.code);
                            }
                        }
                    }

                    content = sb.ToString();

                    if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
                    {
                        content = content.ToLower();
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
                    {
                        List<string> tokens = content.getTokens(true, false, true, true, 1);

                        // Keep the first occurrence of every token, in original order; the set makes the
                        // membership test constant-time instead of scanning a list per token.
                        HashSet<string> seen = new HashSet<string>();
                        StringBuilder uniqueText = new StringBuilder();
                        foreach (String token in tokens)
                        {
                            if (seen.Add(token))
                            {
                                uniqueText.Append(token).Append(' ');
                            }
                        }

                        content = uniqueText.ToString();
                    }

                    content = TrimEmptySpace(content);
                }

                output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
            }

            return output;
        }