/// <summary>
/// Re-initializes this entry from a domain identifier and up to three document representations.
/// </summary>
/// <remarks>
/// The entry type is reset to <c>unknown</c> and then receives one flag for every non-null document.
/// <c>AssignedID</c> is overwritten in the order text document, space document, web document,
/// so the last non-null one of these determines its final value.
/// </remarks>
/// <param name="_domainID">Value stored in <c>DomainID</c>.</param>
/// <param name="_webDocument">Web document to attach; may be null.</param>
/// <param name="_spaceDocument">Space document model to attach; may be null.</param>
/// <param name="_textDocument">Text document to attach; may be null.</param>
public void SetEntry(String _domainID, WebSiteDocument _webDocument, SpaceDocumentModel _spaceDocument, TextDocument _textDocument)
{
    this.type = DocumentSelectEntryType.unknown;
    this.DomainID = _domainID;

    this.webDocument = _webDocument;
    this.spaceDocument = _spaceDocument;
    this.textDocument = _textDocument;

    if (this.textDocument != null)
    {
        this.AssignedID = this.textDocument.name;
        this.type = this.type | DocumentSelectEntryType.textDocument;
    }

    if (this.spaceDocument != null)
    {
        this.AssignedID = this.spaceDocument.name;
        this.type = this.type | DocumentSelectEntryType.spaceDocument;
    }

    if (this.webDocument != null)
    {
        this.AssignedID = _webDocument.AssignedID;
        this.type = this.type | DocumentSelectEntryType.webDocument;
    }
}
/// <summary>
/// Builds the textual source of a web document: its HTTP header followed by its HTML source,
/// each terminated by a line break.
/// </summary>
/// <param name="page">The page whose <c>HTTPHeader</c> and <c>HTMLSource</c> are combined.</param>
/// <param name="logger">Not used by this method.</param>
/// <returns>Header line(s) and HTML source, concatenated in that order.</returns>
protected String GetWebDocumentSource(WebSiteDocument page, ILogBuilder logger = null)
{
    return new StringBuilder()
        .AppendLine(page.HTTPHeader)
        .AppendLine(page.HTMLSource)
        .ToString();
}
/// <summary>
/// Creates a <see cref="WebSiteDocument"/> for a file that belongs to the given web site.
/// </summary>
/// <remarks>
/// The URL path is obtained from the file name via <c>GetURLPathFromFilename</c>; everything up to and
/// including the first occurrence of the site domain is then removed. If the domain does not occur
/// in the path, the path is used unchanged.
/// </remarks>
/// <param name="fi">The file; its name supplies the URL path and its full name is passed to the document.</param>
/// <param name="webSite">The web site whose <c>domain</c> is stripped from the path.</param>
/// <param name="options">Format options; the <c>lazyLoading</c> flag is forwarded to the document constructor.</param>
/// <returns>The newly created document.</returns>
private WebSiteDocument LoadWebSiteDocument(FileInfo fi, WebSiteDocuments webSite, WebDomainCategoryFormatOptions options)
{
    String path = GetURLPathFromFilename(fi.Name);

    // Ordinal search: a domain name is an identifier, not linguistic text.
    int domainIndex = path.IndexOf(webSite.domain, StringComparison.Ordinal);

    // IndexOf returns -1 when the domain is absent; cutting at (-1 + Length) would
    // truncate the path at an unrelated position or throw, so the path is kept as is.
    if (domainIndex >= 0)
    {
        path = path.Substring(domainIndex + webSite.domain.Length);
    }

    Boolean lazyLoading = options.HasFlag(WebDomainCategoryFormatOptions.lazyLoading);

    return new WebSiteDocument(path, lazyLoading, fi.FullName);
}
/// <summary>
/// Creates an entry of type <c>webDocument</c> that wraps the given web site document
/// and adopts its <c>AssignedID</c>.
/// </summary>
/// <param name="document">The web document to wrap; it is dereferenced, so it must not be null.</param>
public DocumentSelectResultEntry(WebSiteDocument document)
{
    this.AssignedID = document.AssignedID;
    this.type = DocumentSelectEntryType.webDocument;
    this.webDocument = document;
}
/// <summary>
/// Appends the aspects of a document selected by <paramref name="flags"/> to <paramref name="sb"/>.
/// </summary>
/// <remarks>
/// Aspects are written in a fixed order: page title, meta description, page content, URL tokens,
/// XPath selection. Nothing is written when <paramref name="doc"/> is null.
/// </remarks>
/// <param name="sb">The builder receiving the rendered text.</param>
/// <param name="doc">The document being rendered; when null the call is a no-op.</param>
/// <param name="htmlRelDoc">
/// Parsed HTML of <paramref name="doc"/>. When null and an HTML-dependent flag is set, the document
/// is obtained from <c>HtmlDocumentCache.DefaultDocumentCache</c> using the document's ID and HTML source.
/// </param>
/// <param name="flags">The flags selecting which aspects are rendered.</param>
/// <param name="code">Passed to the body text renderer and, for <c>page_xpath</c>, to <c>Render</c>.</param>
public void RenderDocumentAspect(StringBuilder sb, WebSiteDocument doc, HtmlDocument htmlRelDoc, DocumentRenderInstructionFlags flags, String code = "")
{
    if (doc == null)
    {
        return;
    }

    Boolean wantsTitle = flags.HasFlag(DocumentRenderInstructionFlags.page_title);
    Boolean wantsDescription = flags.HasFlag(DocumentRenderInstructionFlags.page_description);
    Boolean wantsContent = flags.HasFlag(DocumentRenderInstructionFlags.page_content);
    Boolean wantsUrlTokens = flags.HasFlag(DocumentRenderInstructionFlags.url_tokens);
    Boolean wantsXPath = flags.HasFlag(DocumentRenderInstructionFlags.page_xpath) && !code.isNullOrEmpty();

    // Callers may pass no parsed HTML; resolve it only if an aspect below actually reads it,
    // so that URL-token-only rendering never touches the HTML source.
    if (htmlRelDoc == null && (wantsTitle || wantsDescription || wantsContent || wantsXPath))
    {
        htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(doc.AssignedID, doc.HTMLSource);
    }

    if (wantsTitle)
    {
        HtmlNode titleNode = htmlRelDoc.DocumentNode.Descendants("title").FirstOrDefault();
        if (titleNode != null)
        {
            sb.AppendLine(titleNode.InnerText);
        }
    }

    if (wantsDescription)
    {
        IEnumerable<HtmlNode> metaNodes = htmlRelDoc.DocumentNode.Descendants("meta");
        StringBuilder description = new StringBuilder();

        if (metaNodes != null)
        {
            foreach (HtmlNode metaNode in metaNodes)
            {
                String metaName = metaNode.GetAttributeValue("name", "");

                // Attribute names are identifiers: compare ordinally so the match does not
                // depend on the current culture's casing rules.
                if (metaName.Equals("description", StringComparison.OrdinalIgnoreCase))
                {
                    description.Append(metaNode.GetAttributeValue("content", ""));
                }
            }
        }

        String descriptionText = description.ToString();
        if (!descriptionText.isNullOrEmpty())
        {
            sb.AppendLine(descriptionText);
        }
    }

    if (wantsContent)
    {
        sb.AppendLine(BodyTextRender(htmlRelDoc, code));
    }

    if (wantsUrlTokens)
    {
        List<string> urlTokens = doc.path.getTokens(true, false, true, true, 1);
        foreach (String token in urlTokens)
        {
            sb.Append(token).Append(" ");
        }
    }

    if (wantsXPath)
    {
        sb.AppendLine(Render(htmlRelDoc.DocumentNode, code));
    }
}
/// <summary>
/// Renders a web page into a collection of text layers, one layer per render instruction.
/// </summary>
/// <remarks>
/// For each instruction the text is assembled from the current page (<c>cur_page</c>) and/or from
/// the page's links and linked pages (<c>select_links</c>), then optionally lower-cased, reduced to
/// unique tokens (first occurrence kept), and finally passed through <c>TrimEmptySpace</c>.
/// When <paramref name="EnableRendering"/> is false every layer is created with empty content.
/// </remarks>
/// <param name="webPage">The web page to render; its <c>AssignedID</c> names the output collection.</param>
/// <param name="site">The site the page belongs to; supplies the link graph and the related documents.</param>
/// <param name="EnableRendering">if set to <c>true</c>, instructions are rendered; otherwise layers stay empty.</param>
/// <param name="htmlRelDoc">
/// Parsed HTML of <paramref name="webPage"/>. When null it is obtained from
/// <c>HtmlDocumentCache.DefaultDocumentCache</c>.
/// </param>
/// <returns>The layer collection, containing one layer per instruction.</returns>
/// <exception cref="nlpException">
/// An instruction has the <c>select_links</c> flag but the site has no web graph.
/// </exception>
public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
{
    TextDocumentLayerCollection output = new TextDocumentLayerCollection();
    output.name = webPage.AssignedID;

    if (htmlRelDoc == null)
    {
        htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
    }

    foreach (DocumentRenderInstruction instruction in instructions)
    {
        String content = "";

        if (EnableRendering)
        {
            StringBuilder sb = new StringBuilder();
            DocumentRenderInstructionFlags flags = instruction.instructionFlags;

            if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
            {
                RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
            {
                if (site.extensions.graph == null)
                {
                    throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                }

                imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection = site.extensions.graph.GetLinks(
                    webPage.AssignedID,
                    flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                    flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                {
                    foreach (var link in selection.links)
                    {
                        // Labels are appended as they are, including empty ones.
                        sb.AppendLine(link.linkLabel);
                    }
                }

                if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                {
                    // Only these aspects read the parsed HTML of a related page.
                    Boolean relatedNeedsHtml =
                        flags.HasFlag(DocumentRenderInstructionFlags.page_title) ||
                        flags.HasFlag(DocumentRenderInstructionFlags.page_description) ||
                        flags.HasFlag(DocumentRenderInstructionFlags.page_content) ||
                        flags.HasFlag(DocumentRenderInstructionFlags.page_xpath);

                    foreach (var node in selection.linkedNodeClones.Values)
                    {
                        WebSiteDocument relatedPage = site.documents.FirstOrDefault(x => x.AssignedID == node.name);
                        if (relatedPage == null)
                        {
                            // The linked node has no matching document in this site: nothing to render.
                            continue;
                        }

                        // Each related page must be rendered from its own HTML; passing null here
                        // made the HTML-based aspects fail with a NullReferenceException.
                        HtmlDocument relatedHtml = null;
                        if (relatedNeedsHtml)
                        {
                            relatedHtml = HtmlDocumentCache.DefaultDocumentCache.GetDocument(relatedPage.AssignedID, relatedPage.HTMLSource);
                        }

                        RenderDocumentAspect(sb, relatedPage, relatedHtml, flags, instruction.code);
                    }
                }
            }

            content = sb.ToString();

            if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
            {
                // Invariant casing keeps the rendered layers independent of the machine's culture.
                content = content.ToLowerInvariant();
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
            {
                List<string> tokens = content.getTokens(true, false, true, true, 1);
                HashSet<String> seenTokens = new HashSet<String>();
                StringBuilder uniqueText = new StringBuilder();

                foreach (String token in tokens)
                {
                    // Add returns false for a token already seen, so first-occurrence order is kept.
                    if (seenTokens.Add(token))
                    {
                        uniqueText.Append(token).Append(" ");
                    }
                }

                content = uniqueText.ToString();
            }

            content = TrimEmptySpace(content);
        }

        output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
    }

    return output;
}
/// <summary>
/// Loads the web sites stored as files in a directory and adds them to a category.
/// </summary>
/// <remarks>
/// Each file name is converted to a URL path with <c>GetURLPathFromFilename</c>. Files whose path
/// starts with "http" and matches <c>SelectDomainName</c> are grouped by the captured domain; one
/// <see cref="WebSiteDocuments"/> is created per domain and filled with one document per file.
/// Documents whose assigned ID was already added to the same site are skipped.
/// </remarks>
/// <param name="category">The category receiving the loaded sites.</param>
/// <param name="di">The directory whose files are scanned.</param>
/// <param name="options">Format options forwarded to <c>LoadWebSiteDocument</c>.</param>
/// <param name="logger">Optional logger for progress messages; may be null.</param>
private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    FileInfo[] fileList = di.GetFiles();

    // NOTE(review): a directory holding exactly one file is skipped as well as an empty one.
    // Kept as in the original; confirm that "> 1" rather than "> 0" is the intended threshold.
    if (fileList.Length <= 1)
    {
        return;
    }

    Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<String, List<FileInfo>>();

    foreach (FileInfo fi in fileList)
    {
        String path = GetURLPathFromFilename(fi.Name);

        // URL scheme prefix: ordinal comparison, independent of the current culture.
        if (!path.StartsWith("http", StringComparison.Ordinal))
        {
            continue;
        }

        Match domainMatch = SelectDomainName.Match(path);
        if (!domainMatch.Success)
        {
            continue;
        }

        String domain = domainMatch.Groups[1].Value;

        List<FileInfo> siteFiles;
        if (!siteFilesIndex.TryGetValue(domain, out siteFiles))
        {
            siteFiles = new List<FileInfo>();
            siteFilesIndex.Add(domain, siteFiles);
        }

        siteFiles.Add(fi);
    }

    if (logger != null)
    {
        logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
    }

    foreach (KeyValuePair<String, List<FileInfo>> siteEntry in siteFilesIndex)
    {
        WebSiteDocuments webSite = new WebSiteDocuments(siteEntry.Key);

        // Assigned IDs already added to this site; a set keeps the duplicate check cheap.
        HashSet<String> knownIDs = new HashSet<String>();

        foreach (FileInfo fi in siteEntry.Value)
        {
            WebSiteDocument document = LoadWebSiteDocument(fi, webSite, options);
            document.AssignedID = WebSiteDocumentsSetTools.GetPageURL(document, webSite);

            // Only the first document with a given assigned ID is kept.
            if (knownIDs.Add(document.AssignedID))
            {
                webSite.documents.Add(document);
            }
        }

        category.siteDocuments.Add(webSite);

        if (logger != null)
        {
            logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
        }
    }
}