/// <summary>
        /// Resets this entry and binds it to the supplied documents.
        /// </summary>
        /// <remarks>
        /// The entry type starts as <c>unknown</c> and receives one flag per non-null document.
        /// <c>AssignedID</c> is overwritten by each non-null document in the order text, space, web,
        /// so the last non-null document in that order decides the final identifier.
        /// When all three documents are null, <c>AssignedID</c> is left untouched.
        /// </remarks>
        /// <param name="_domainID">Value stored in <c>DomainID</c>.</param>
        /// <param name="_webDocument">Web document to attach; may be null.</param>
        /// <param name="_spaceDocument">Space document model to attach; may be null.</param>
        /// <param name="_textDocument">Text document to attach; may be null.</param>
        public void SetEntry(String _domainID, WebSiteDocument _webDocument, SpaceDocumentModel _spaceDocument, TextDocument _textDocument)
        {
            // Start from a clean state before any flag is added.
            type = DocumentSelectEntryType.unknown;
            DomainID = _domainID;

            // Store all three references, including the null ones.
            webDocument = _webDocument;
            spaceDocument = _spaceDocument;
            textDocument = _textDocument;

            // Lowest priority for the identifier: text document.
            if (textDocument != null)
            {
                AssignedID = textDocument.name;
                type = type | DocumentSelectEntryType.textDocument;
            }

            // Overrides the identifier taken from the text document.
            if (spaceDocument != null)
            {
                AssignedID = spaceDocument.name;
                type = type | DocumentSelectEntryType.spaceDocument;
            }

            // Highest priority: the web document identifier wins when present.
            if (webDocument != null)
            {
                AssignedID = _webDocument.AssignedID;
                type = type | DocumentSelectEntryType.webDocument;
            }
        }
        /// <summary>
        /// Builds the textual source of a web document: its HTTP header followed by its HTML source,
        /// each terminated by a line break.
        /// </summary>
        /// <param name="page">The page whose <c>HTTPHeader</c> and <c>HTMLSource</c> are combined.</param>
        /// <param name="logger">Not used by this method.</param>
        /// <returns>The header line(s) and the HTML source, in that order, as a single string.</returns>
        protected String GetWebDocumentSource(WebSiteDocument page, ILogBuilder logger = null)
        {
            return new StringBuilder()
                   .AppendLine(page.HTTPHeader)
                   .AppendLine(page.HTMLSource)
                   .ToString();
        }
        /// <summary>
        /// Creates a <see cref="WebSiteDocument"/> for a file on disk, using the part of the URL path
        /// that follows the site's domain as the document path.
        /// </summary>
        /// <param name="fi">The file; its name is converted to a URL path via <c>GetURLPathFromFilename</c> and its full name is passed to the document.</param>
        /// <param name="webSite">The web site whose <c>domain</c> is stripped from the URL path.</param>
        /// <param name="options">Format options; the presence of the <c>lazyLoading</c> flag is forwarded to the document constructor.</param>
        /// <returns>The newly created document.</returns>
        private WebSiteDocument LoadWebSiteDocument(FileInfo fi, WebSiteDocuments webSite, WebDomainCategoryFormatOptions options)
        {
            String path = GetURLPathFromFilename(fi.Name);

            // Ordinal search: domains are identifiers, not linguistic text, so culture rules must not apply.
            Int32 domainStart = path.IndexOf(webSite.domain, StringComparison.Ordinal);

            // When the domain is not part of the path, IndexOf returns -1. Cutting at
            // (-1 + domain.Length) would silently drop unrelated characters, so the path is
            // kept unchanged in that case.
            if (domainStart >= 0)
            {
                path = path.Substring(domainStart + webSite.domain.Length);
            }

            Boolean lazyLoading = options.HasFlag(WebDomainCategoryFormatOptions.lazyLoading);

            return new WebSiteDocument(path, lazyLoading, fi.FullName);
        }
 /// <summary>
 /// Creates an entry that represents a single web document.
 /// </summary>
 /// <param name="document">The web document to wrap; its <c>AssignedID</c> becomes the entry's identifier.</param>
 public DocumentSelectResultEntry(WebSiteDocument document)
 {
     // The entry takes over the identifier of the wrapped document.
     this.AssignedID = document.AssignedID;

     // Only the web document flag is set by this constructor.
     this.type = DocumentSelectEntryType.webDocument;

     this.webDocument = document;
 }
        /// <summary>
        /// Appends the aspects of a document that are selected by <paramref name="flags"/> to <paramref name="sb"/>.
        /// </summary>
        /// <remarks>
        /// Aspects are appended in a fixed order: page title, meta description, page content, URL tokens and
        /// custom XPath selection. Nothing is appended when <paramref name="doc"/> is null.
        /// The title, description and XPath aspects read the parsed HTML; they are skipped when
        /// <paramref name="htmlRelDoc"/> is null instead of failing with a <see cref="NullReferenceException"/>.
        /// </remarks>
        /// <param name="sb">The builder that receives the rendered text.</param>
        /// <param name="doc">The document being rendered; its <c>path</c> is used for the URL tokens.</param>
        /// <param name="htmlRelDoc">The parsed HTML of <paramref name="doc"/>; may be null.</param>
        /// <param name="flags">The flags selecting which aspects are rendered.</param>
        /// <param name="code">Instruction code; passed to the content renderer and used as the selector for the XPath aspect.</param>
        public void RenderDocumentAspect(StringBuilder sb, WebSiteDocument doc, HtmlDocument htmlRelDoc, DocumentRenderInstructionFlags flags, String code = "")
        {
            if (doc == null)
            {
                return;
            }

            // Root node of the parsed HTML, or null when no parsed document was supplied.
            HtmlNode root = null;
            if (htmlRelDoc != null)
            {
                root = htmlRelDoc.DocumentNode;
            }

            if (root != null && flags.HasFlag(DocumentRenderInstructionFlags.page_title))
            {
                HtmlNode titleNode = root.Descendants("title").FirstOrDefault();
                if (titleNode != null)
                {
                    sb.AppendLine(titleNode.InnerText);
                }
            }

            if (root != null && flags.HasFlag(DocumentRenderInstructionFlags.page_description))
            {
                String dsc = "";

                foreach (HtmlNode nd in root.Descendants("meta"))
                {
                    String metaName = nd.GetAttributeValue("name", "");

                    // HTML attribute values are matched ordinally; a culture-sensitive comparison
                    // fails for "DESCRIPTION" under cultures with special casing rules (e.g. tr-TR).
                    if (String.Equals(metaName, "description", StringComparison.OrdinalIgnoreCase))
                    {
                        dsc += nd.GetAttributeValue("content", "");
                    }
                }

                if (!dsc.isNullOrEmpty())
                {
                    sb.AppendLine(dsc);
                }
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.page_content))
            {
                // htmlRelDoc is handed over as-is; null handling is left to BodyTextRender.
                sb.AppendLine(BodyTextRender(htmlRelDoc, code));
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.url_tokens))
            {
                List<string> urlTokens = doc.path.getTokens(true, false, true, true, 1);

                // Tokens are separated by spaces, without a trailing line break.
                foreach (String urlToken in urlTokens)
                {
                    sb.Append(urlToken).Append(" ");
                }
            }

            if (root != null && flags.HasFlag(DocumentRenderInstructionFlags.page_xpath))
            {
                if (!code.isNullOrEmpty())
                {
                    sb.AppendLine(Render(root, code));
                }
            }
        }
        /// <summary>
        /// Renders a web page into text layers, creating one layer per entry in <c>instructions</c>.
        /// </summary>
        /// <remarks>
        /// For each instruction the text is collected from the current page (<c>cur_page</c>) and/or from
        /// the links and linked pages of the site graph (<c>select_links</c>), then optionally lower-cased,
        /// reduced to unique tokens, and passed through <c>TrimEmptySpace</c>.
        /// </remarks>
        /// <param name="webPage">The web page to render; its <c>AssignedID</c> names the returned collection.</param>
        /// <param name="site">The site the page belongs to; its graph and documents are used for link-based instructions.</param>
        /// <param name="EnableRendering">When <c>false</c>, every layer is created with empty content.</param>
        /// <param name="htmlRelDoc">Parsed HTML of <paramref name="webPage"/>; when null it is obtained from the default document cache.</param>
        /// <returns>A layer collection with one layer per instruction.</returns>
        /// <exception cref="nlpException">An instruction selects links but the site has no graph.</exception>
        public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
        {
            TextDocumentLayerCollection output = new TextDocumentLayerCollection();

            output.name = webPage.AssignedID;

            if (htmlRelDoc == null)
            {
                htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
            }

            foreach (DocumentRenderInstruction instruction in instructions)
            {
                String content = "";

                if (EnableRendering)
                {
                    StringBuilder sb = new StringBuilder();
                    DocumentRenderInstructionFlags flags = instruction.instructionFlags;

                    if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
                    {
                        RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
                    {
                        if (site.extensions.graph == null)
                        {
                            throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                        }

                        imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection =
                            site.extensions.graph.GetLinks(webPage.AssignedID,
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                        // The flag does not change per link, so it is checked once.
                        if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                        {
                            foreach (var link in selection.links)
                            {
                                // Every caption is appended, including empty ones.
                                sb.AppendLine(link.linkLabel);
                            }
                        }

                        if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                        {
                            // Aspects of a related page that are rendered from its HTML. The HTML is
                            // loaded only when one of them is requested; URL tokens do not need it.
                            Boolean needsHtml = flags.HasFlag(DocumentRenderInstructionFlags.page_title) ||
                                                flags.HasFlag(DocumentRenderInstructionFlags.page_description) ||
                                                flags.HasFlag(DocumentRenderInstructionFlags.page_content) ||
                                                flags.HasFlag(DocumentRenderInstructionFlags.page_xpath);

                            foreach (var node in selection.linkedNodeClones.Values)
                            {
                                WebSiteDocument doc = site.documents.FirstOrDefault(x => x.AssignedID == node.name);
                                if (doc == null)
                                {
                                    // The linked node has no matching document in this site.
                                    continue;
                                }

                                // Load the related page's own HTML the same way as for the current page.
                                // Passing null here made the HTML-based aspects fail on a null reference.
                                HtmlDocument relatedHtml = null;
                                if (needsHtml)
                                {
                                    relatedHtml = HtmlDocumentCache.DefaultDocumentCache.GetDocument(doc.AssignedID, doc.HTMLSource);
                                }

                                RenderDocumentAspect(sb, doc, relatedHtml, flags, instruction.code);
                            }
                        }
                    }

                    content = sb.ToString();

                    if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
                    {
                        content = content.ToLower();
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
                    {
                        List<string> tokens = content.getTokens(true, false, true, true, 1);

                        // The set gives constant-time duplicate detection, while the builder keeps
                        // the first occurrence of each token in its original order.
                        HashSet<string> seenTokens = new HashSet<string>();
                        StringBuilder uniqueTokens = new StringBuilder();

                        foreach (String token in tokens)
                        {
                            if (seenTokens.Add(token))
                            {
                                uniqueTokens.Append(token).Append(" ");
                            }
                        }

                        content = uniqueTokens.ToString();
                    }

                    content = TrimEmptySpace(content);
                }

                output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
            }

            return output;
        }
        /// <summary>
        /// Groups the files of a directory by web site domain and adds one <see cref="WebSiteDocuments"/>
        /// per domain to the category.
        /// </summary>
        /// <remarks>
        /// Each file name is converted to a URL path with <c>GetURLPathFromFilename</c>. Files whose path does
        /// not start with "http", or does not match <c>SelectDomainName</c>, are ignored. Within a site, only the
        /// first document for each <c>AssignedID</c> is kept.
        /// </remarks>
        /// <param name="category">The category that receives the loaded web sites.</param>
        /// <param name="di">The directory whose files are scanned (top level only).</param>
        /// <param name="options">Format options forwarded to <c>LoadWebSiteDocument</c>.</param>
        /// <param name="logger">Optional logger for progress messages; may be null.</param>
        private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
        {
            FileInfo[] fileList = di.GetFiles();

            // NOTE(review): a directory holding exactly one file is skipped by this check -
            // confirm that "more than one file" (rather than "at least one") is intended.
            if (fileList.Length <= 1)
            {
                return;
            }

            Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<String, List<FileInfo>>();

            foreach (FileInfo fi in fileList)
            {
                String path = GetURLPathFromFilename(fi.Name);

                // URL schemes are matched ordinally; culture rules must not influence the prefix test.
                if (!path.StartsWith("http", StringComparison.Ordinal))
                {
                    continue;
                }

                Match m = SelectDomainName.Match(path);
                if (!m.Success)
                {
                    continue;
                }

                String domain = m.Groups[1].Value;

                // Single lookup: fetch the list for the domain or create it on first use.
                List<FileInfo> domainFiles;
                if (!siteFilesIndex.TryGetValue(domain, out domainFiles))
                {
                    domainFiles = new List<FileInfo>();
                    siteFilesIndex.Add(domain, domainFiles);
                }

                domainFiles.Add(fi);
            }

            if (logger != null)
            {
                logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
            }

            foreach (KeyValuePair<String, List<FileInfo>> siteFiles in siteFilesIndex)
            {
                WebSiteDocuments webSite = new WebSiteDocuments(siteFiles.Key);

                // Identifiers already added to this site; used to drop duplicate pages.
                HashSet<String> knownIDs = new HashSet<String>();

                foreach (FileInfo fi in siteFiles.Value)
                {
                    WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);

                    d.AssignedID = WebSiteDocumentsSetTools.GetPageURL(d, webSite);

                    // Add returns false for an identifier that was seen before; such documents are skipped.
                    if (knownIDs.Add(d.AssignedID))
                    {
                        webSite.documents.Add(d);
                    }
                }

                category.siteDocuments.Add(webSite);

                if (logger != null)
                {
                    logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
                }
            }
        }