/// <summary>
/// Renders every document of a web site and gathers the results into a single document set.
/// </summary>
/// <param name="site">The site whose documents are rendered; its domain is passed to the constructor of the resulting set.</param>
/// <param name="logger">The logger (not used by this method).</param>
/// <param name="EnableRendering">Forwarded unchanged to <c>RenderText</c> for each document.</param>
/// <returns>A set containing one rendered layer collection per document of <paramref name="site"/>.</returns>
public TextDocumentSet RenderSiteDocuments(WebSiteDocuments site, ILogBuilder logger, Boolean EnableRendering = true)
{
    var renderedSet = new TextDocumentSet(site.domain);

    foreach (WebSiteDocument page in site.documents)
    {
        TextDocumentLayerCollection renderedPage = RenderText(page, site, EnableRendering);

        // The rendered collection is named after the ID assigned to its source page.
        renderedPage.name = page.AssignedID;
        renderedSet.Add(renderedPage);
    }

    return renderedSet;
}
/// <summary>
/// Breaks the rendered layers into content units, at the granularity selected by <paramref name="options"/>.
/// </summary>
/// <param name="layers">The rendered layers of a document.</param>
/// <param name="options">
/// Flags selecting the granularity. <c>pageLevel</c> is tested first, then <c>blockLevel</c>, then <c>sentenceLevel</c>.
/// </param>
/// <returns>
/// With <c>pageLevel</c> set: a list holding the single string <c>layers.ToString()</c>.
/// With none of the three level flags set: an empty list.
/// </returns>
/// <exception cref="NotImplementedException">
/// <c>pageLevel</c> is not set and <c>blockLevel</c> or <c>sentenceLevel</c> is set; these granularities are not implemented.
/// </exception>
public List <String> breakToContentUnits(TextDocumentLayerCollection layers, DocumentBlenderFunctionOptions options)
{
    List<String> units = new List<String>();

    if (options.HasFlag(DocumentBlenderFunctionOptions.pageLevel))
    {
        // Page level: the whole document is one unit.
        units.Add(layers.ToString());
        return units;
    }

    if (options.HasFlag(DocumentBlenderFunctionOptions.blockLevel))
    {
        throw new NotImplementedException("Breaking content into block-level units is not implemented.");
    }

    if (options.HasFlag(DocumentBlenderFunctionOptions.sentenceLevel))
    {
        throw new NotImplementedException("Breaking content into sentence-level units is not implemented.");
    }

    // No level flag set: nothing to break, return the empty list.
    return units;
}
/// <summary>
/// Returns the <c>TotalScore</c> stored in <c>stats</c> for the given document.
/// </summary>
/// <param name="document">The document; its <c>name</c> is the second-level key into <c>stats</c>.</param>
/// <param name="documentSet">The first-level key into <c>stats</c>.</param>
/// <returns>The <c>TotalScore</c> of the matching stats entry.</returns>
public override double Compute(TextDocumentLayerCollection document, String documentSet)
{
    var setStats = stats[documentSet];
    var documentStats = setStats[document.name];

    return documentStats.TotalScore;
}
/// <summary>
/// Returns the <c>entropyFreq</c> value stored in <c>stats</c> for the given document.
/// </summary>
/// <param name="document">The document; its <c>name</c> is the second-level key into <c>stats</c>.</param>
/// <param name="documentSet">The first-level key into <c>stats</c>.</param>
/// <returns>The <c>entropyFreq</c> of the matching stats entry.</returns>
public override double Compute(TextDocumentLayerCollection document, String documentSet)
{
    var setStats = stats[documentSet];
    var documentStats = setStats[document.name];

    return documentStats.entropyFreq;
}
/// <summary>
/// Not implemented in this class; every call throws.
/// </summary>
/// <param name="document">The document (ignored).</param>
/// <param name="parentID">The parent identifier (ignored).</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override double Compute(TextDocumentLayerCollection document, string parentID)
{
    throw new NotImplementedException();
}
/// <summary>
/// Renders the provided web page into text layers, creating one layer per render instruction.
/// </summary>
/// <param name="webPage">The web page to render.</param>
/// <param name="site">The site of the page; its link graph and documents are used by instructions with the <c>select_links</c> flag.</param>
/// <param name="EnableRendering">if set to <c>true</c> the instructions are rendered; otherwise every layer is created with empty content.</param>
/// <param name="htmlRelDoc">The HTML document of <paramref name="webPage"/>; when <c>null</c> it is obtained from <c>HtmlDocumentCache.DefaultDocumentCache</c>.</param>
/// <returns>A layer collection named after <c>webPage.AssignedID</c>, holding one layer per instruction.</returns>
/// <exception cref="nlpException">
/// Rendering is enabled, an instruction has the <c>select_links</c> flag, and <c>site.extensions.graph</c> is <c>null</c>.
/// </exception>
public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
{
    TextDocumentLayerCollection output = new TextDocumentLayerCollection();
    output.name = webPage.AssignedID;

    if (htmlRelDoc == null)
    {
        htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
    }

    foreach (DocumentRenderInstruction instruction in instructions)
    {
        // With rendering disabled the layer is still created, with empty content.
        String content = "";

        if (EnableRendering)
        {
            DocumentRenderInstructionFlags flags = instruction.instructionFlags;
            StringBuilder sb = new StringBuilder();

            if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
            {
                RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
            {
                if (site.extensions.graph == null)
                {
                    throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                }

                imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection = site.extensions.graph.GetLinks(
                    webPage.AssignedID,
                    flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                    flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                // The flag does not change per link, so it is tested once instead of on every iteration.
                if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                {
                    foreach (var link in selection.links)
                    {
                        // The label is appended as-is, including null or empty labels (which yield an empty line).
                        sb.AppendLine(link.linkLabel);
                    }
                }

                if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                {
                    foreach (var node in selection.linkedNodeClones.Values)
                    {
                        WebSiteDocument doc = site.documents.FirstOrDefault(x => x.AssignedID == node.name);

                        // FirstOrDefault yields null when no site document carries the node's name;
                        // such nodes are skipped instead of handing a null page to the renderer.
                        if (doc == null)
                        {
                            continue;
                        }

                        RenderDocumentAspect(sb, doc, null, flags, instruction.code);
                    }
                }
            }

            content = sb.ToString();

            if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
            {
                content = content.ToLower();
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
            {
                List<string> tokens = content.getTokens(true, false, true, true, 1);

                // Keep only the first occurrence of every token, in original order.
                // HashSet.Add returns false for a token that was already seen.
                HashSet<string> seen = new HashSet<string>();
                StringBuilder uniqueContent = new StringBuilder();

                foreach (String token in tokens)
                {
                    if (seen.Add(token))
                    {
                        uniqueContent.Append(token).Append(" ");
                    }
                }

                content = uniqueContent.ToString();
            }

            content = TrimEmptySpace(content);
        }

        output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
    }

    return output;
}
/// <summary>
/// Computes a numeric value for the given document; the calculation is defined by the implementing class.
/// </summary>
/// <param name="document">The document to compute the value for.</param>
/// <param name="parentID">
/// Identifier supplied by the caller. NOTE(review): presumably the ID of the document set the document belongs to - confirm against implementations.
/// </param>
/// <returns>The computed value.</returns>
public abstract Double Compute(TextDocumentLayerCollection document, String parentID);