예제 #1
        /// <summary>
        /// Instructs the HTML to text extraction engine (EntityPlaneMethod) to produce text from an XPath selection.
        /// Builds a <c>DocumentRenderInstruction</c> from the supplied arguments.
        /// </summary>
        /// <param name="name">Instruction name: a human-readable descriptive name or a special instruction name like ::BODYTEXT::</param>
        /// <param name="flags">Instruction flags, controls what and how to render</param>
        /// <param name="code">XPath associated with the instruction, selects nodes to be rendered into text</param>
        /// <param name="weight">Weight factor of the instruction, i.e. number of times the content should be repeated (boosting TF)</param>
        /// <param name="expansion">Graph selection expansion steps - to reach nodes that are 1+ edges away</param>
        /// <param name="remove">If <c>true</c>, any existing instruction in the set is removed first</param>
        /// <remarks>
        /// It will add the specified instruction to the rendering instruction set, and optionally remove all existing instructions before it.
        /// NOTE(review): in this excerpt the brace lines and some statements appear to be missing, so neither the
        /// removal nor the add-to-set statement is visible - confirm against the full source.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_setRenderInstruction(
            [Description("Instruction name, it is human-readable descriptive name or special instructin name like ::BODYTEXT::")] String name = "::BODYTEXT::",
            [Description("Instruction flags, controls what and how to render")] DocumentRenderInstructionFlags flags   = DocumentRenderInstructionFlags.this_page_content,
            [Description("XPath associated with the instruction, selects nodes to be rendered into text")] String code = "",
            [Description("Weight factor of the instruction, i.e. number of times the content should be repeated (boosting TF)")] Double weight = 1.0,
            [Description("Graph selection expansion steps - to reach 1+ edges far nodes ")] Int32 expansion = 1,
            [Description("If true it will remove any existing instruction in the set")] Boolean remove      = false)
            // NOTE(review): the statement guarded by this condition is not visible in this excerpt;
            // per the parameter description it presumably clears the existing instruction set - TODO confirm.
            if (remove)

            // The instruction is constructed from name, XPath code and weight.
            DocumentRenderInstruction dri = new DocumentRenderInstruction(name, code, weight);

            // Remaining settings are applied through the instance members.
            dri.instructionFlags    = flags;
            dri.graphExpansionSteps = expansion;
            // weight was already passed to the constructor above; it is assigned again here.
            dri.weight = weight;
            // NOTE(review): the statement that adds dri to the instruction set is not visible in this excerpt.
        /// <summary>
        /// Renders the aspects of a document that are selected by <paramref name="flags"/> and appends the
        /// resulting text to <paramref name="sb"/>.
        /// </summary>
        /// <param name="sb">Builder that receives the rendered text.</param>
        /// <param name="doc">The document being rendered; its <c>path</c> is tokenized for the url_tokens aspect. The null check on it appears to guard the whole body (judging by indentation).</param>
        /// <param name="htmlRelDoc">Parsed HTML document from which the title node, meta description, body text and XPath selection are read.</param>
        /// <param name="flags">Flags that select which aspects (page_title, page_description, page_content, url_tokens, page_xpath) are rendered.</param>
        /// <param name="code">XPath expression passed to <c>BodyTextRender</c> and, for page_xpath, to <c>Render</c>.</param>
        /// <exception cref="NotImplementedException">NOTE(review): no throw statement is visible in this excerpt - TODO confirm whether this still applies.</exception>
        /// <remarks>
        /// NOTE(review): brace lines and some statements appear to be missing from this excerpt; the nesting described
        /// in the inline comments is inferred from indentation.
        /// </remarks>
        public void RenderDocumentAspect(StringBuilder sb, WebSiteDocument doc, HtmlDocument htmlRelDoc, DocumentRenderInstructionFlags flags, String code = "")
            if (doc != null)
                // Aspect: page title - taken from the first <title> descendant, if any.
                if (flags.HasFlag(DocumentRenderInstructionFlags.page_title))
                    HtmlNode titleNode = htmlRelDoc.DocumentNode.Descendants("title").FirstOrDefault();
                    if (titleNode != null)
                        String titleString = titleNode.InnerText;
                        // NOTE(review): the statement that uses titleString is not visible in this excerpt.

                    //sb.AppendLine(Render(htmlRelDoc.DocumentNode, DocumentRenderInstruction.XPATH_SELECT_TITLE));

                // Aspect: page description - concatenates the "content" attribute of every <meta> element
                // whose "name" attribute equals "description" (case-insensitive, current culture).
                if (flags.HasFlag(DocumentRenderInstructionFlags.page_description))
                    IEnumerable <HtmlNode> metaNodes = htmlRelDoc.DocumentNode.Descendants("meta");
                    String dsc = "";
                    if (metaNodes != null)
                        foreach (HtmlNode nd in metaNodes)
                            String metaName = nd.GetAttributeValue("name", "");
                            if (metaName.Equals("description", StringComparison.CurrentCultureIgnoreCase))
                                dsc += nd.GetAttributeValue("content", "");

                    // NOTE(review): the statement guarded by this condition, and the one that writes dsc
                    // into sb, are not visible in this excerpt.
                    if (dsc.isNullOrEmpty())

                    // sb.AppendLine(Render(htmlRelDoc.DocumentNode, DocumentRenderInstruction.XPATH_SELECT_DESCRIPTION));

                // Aspect: page content - appends the result of BodyTextRender for the document and XPath code.
                if (flags.HasFlag(DocumentRenderInstructionFlags.page_content))
                    sb.AppendLine(BodyTextRender(htmlRelDoc, code));

                // Aspect: URL tokens - appends each token returned by getTokens for doc.path, space separated.
                if (flags.HasFlag(DocumentRenderInstructionFlags.url_tokens))
                    List <string> r = doc.path.getTokens(true, false, true, true, 1);

                    foreach (String rk in r)
                        sb.Append(rk + " ");

                // Aspect: XPath selection - rendered only when an XPath code was supplied.
                if (flags.HasFlag(DocumentRenderInstructionFlags.page_xpath))
                    if (!code.isNullOrEmpty())
                        sb.AppendLine(Render(htmlRelDoc.DocumentNode, code));
        /// <summary>
        /// Renders the HTML source of a web page into text layers, creating one layer for each entry of
        /// <c>instructions</c>.
        /// </summary>
        /// <param name="webPage">The web page; its <c>AssignedID</c> names the output collection and, together with <c>HTMLSource</c>, is used to obtain the parsed HTML document.</param>
        /// <param name="site">The site; supplies <c>extensions.graph</c> (required for select_links instructions) and <c>documents</c> for resolving linked pages.</param>
        /// <param name="EnableRendering">If <c>true</c>, instruction content is rendered; otherwise each layer is created with empty content.</param>
        /// <param name="htmlRelDoc">Parsed HTML document of the page; when <c>null</c> it is obtained from <c>HtmlDocumentCache.DefaultDocumentCache</c>.</param>
        /// <returns>Presumably the layer collection built in this method - NOTE(review): the return statement is not visible in this excerpt.</returns>
        /// <remarks>
        /// NOTE(review): brace lines and some statements appear to be missing from this excerpt; the nesting described
        /// in the inline comments is inferred from indentation.
        /// </remarks>
        public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
            TextDocumentLayerCollection output = new TextDocumentLayerCollection();

            output.name = webPage.AssignedID;

            // Obtain the parsed HTML document from the default cache when the caller did not supply one.
            if (htmlRelDoc == null)
                htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);

            foreach (DocumentRenderInstruction instruction in instructions)
                String        content = "";
                StringBuilder sb      = new StringBuilder();

                if (EnableRendering)
                    DocumentRenderInstructionFlags flags = instruction.instructionFlags;

                    // Render the aspects of the current page itself.
                    if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
                        RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
                    // Link-based rendering requires the site's web graph.
                    if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
                        if (site.extensions.graph == null)
                            throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");

                        // NOTE(review): the right-hand side of this assignment is not visible in this excerpt.
                        imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection =

                        foreach (var link in selection.links)
                            if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                                // NOTE(review): the statements that handle the link label are not visible in this excerpt.
                                if (link.linkLabel.isNullOrEmpty())

                        foreach (var node in selection.linkedNodeClones.Values)
                            if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                                // Resolve the linked node to a site document by AssignedID; may be null when not found,
                                // which RenderDocumentAspect guards against.
                                WebSiteDocument doc = site.documents.Where(x => x.AssignedID == node.name).FirstOrDefault();
                                // NOTE(review): htmlRelDoc is passed as null here, while RenderDocumentAspect dereferences it
                                // for the title, description and XPath aspects - verify that this cannot throw.
                                RenderDocumentAspect(sb, doc, null, flags, instruction.code);

                    content = sb.ToString();

                    // Optional post-processing: lower-casing.
                    if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
                        content = content.ToLower();

                    // Optional post-processing: keep each token once and rebuild the content space separated.
                    if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
                        List <string> r  = content.getTokens(true, false, true, true, 1);
                        List <String> ur = new List <string>();
                        foreach (String rk in r)
                            // NOTE(review): the guarded statement is not visible in this excerpt; presumably it adds rk to ur.
                            if (!ur.Contains(rk))

                        StringBuilder sbr = new StringBuilder();
                        foreach (String rk in ur)
                            sbr.Append(rk + " ");
                        content = sbr.ToString();

                    content = TrimEmptySpace(content);

                // One layer per instruction; the instruction weight is converted with Convert.ToInt32.
                output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
