/// <summary>
/// Reduces the HTML source of every document in the given web site set.
/// </summary>
/// <param name="docSet">The document set - web site.</param>
/// <param name="settings">The settings.</param>
/// <param name="logger">The logger.</param>
/// <returns>Rate of reduction (output size vs. input size)</returns>
public Double ReduceDocumentSet(WebSiteDocuments docSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
{
    Int32 bytesIn = 0;
    Int32 bytesOut = 0;

    foreach (WebSiteDocument document in docSet.documents)
    {
        bytesIn += document.HTMLSource.Length;

        String reducedHtml = ReduceDocument(document.HTMLSource, settings, logger);

        bytesOut += reducedHtml.Length;

        // Reduction is applied in place: the document keeps only the reduced source.
        document.HTMLSource = reducedHtml;
    }

    Double reduction = bytesOut.GetRatio(bytesIn);

    if (settings.logSiteLevel)
    {
        logger.AppendLine("[" + docSet.domain + "] reduced to: " + reduction.ToString("P2"));
    }

    return reduction;
}
/// <summary>
/// Returns the web site entry registered for the given domain, creating and
/// registering a new one if none exists yet.
/// </summary>
/// <param name="domainName">Domain name used as the lookup key.</param>
/// <returns>Existing or newly created <see cref="WebSiteDocuments"/> for the domain.</returns>
public WebSiteDocuments GetOrAdd(String domainName)
{
    // Single scan instead of Any() followed by FirstOrDefault():
    // the original enumerated the collection twice for the "exists" case.
    WebSiteDocuments existing = siteDocuments.FirstOrDefault(x => x.domain == domainName);

    if (existing != null)
    {
        return existing;
    }

    WebSiteDocuments output = new WebSiteDocuments(domainName);
    siteDocuments.Add(output);
    return output;
}
/// <summary>
/// Persists every page of the given web site into the specified folder, one file per page.
/// Files that already exist on disk are left untouched.
/// </summary>
/// <param name="site">Web site whose pages are saved.</param>
/// <param name="folder">Target folder node.</param>
public void SaveWebSite(WebSiteDocuments site, folderNode folder)
{
    foreach (WebSiteDocument page in site.documents)
    {
        // Build a URL-like name, collapse duplicate separators, then turn it into
        // a file-system-safe filename.
        String url = site.domain.add(page.path, "/").Replace("//", "/");
        url = "http://" + url;

        String safeName = WebSiteDocumentsSetTools.GetSafeFilename(GetFilenameFromURLPath(url));

        String targetPath = folder.pathFor(safeName, imbSCI.Data.enums.getWritableFileMode.existing, "Page of [" + site.domain + "] at path [" + page.path + "]", false);

        String source = GetWebDocumentSource(page);

        // Never overwrite a previously saved page.
        if (!File.Exists(targetPath))
        {
            File.WriteAllText(targetPath, source);
        }
    }
}
/// <summary>
/// Renders a web site into set of documents
/// </summary>
/// <param name="site">The site.</param>
/// <param name="logger">The logger.</param>
/// <param name="EnableRendering">if set to <c>true</c>, instruction rendering is performed; otherwise layers are created with empty content.</param>
/// <returns>Set of rendered text documents, one per page, each named by the page's AssignedID.</returns>
public TextDocumentSet RenderSiteDocuments(WebSiteDocuments site, ILogBuilder logger, Boolean EnableRendering = true)
{
    TextDocumentSet textSet = new TextDocumentSet(site.domain);

    // Pages are rendered sequentially. A parallel version was previously attempted and
    // removed; RenderText goes through shared caches (HtmlDocumentCache) whose thread
    // safety is not established here -- verify before parallelizing again.
    foreach (WebSiteDocument webPage in site.documents)
    {
        TextDocumentLayerCollection pg = RenderText(webPage, site, EnableRendering);
        pg.name = webPage.AssignedID;
        textSet.Add(pg);
    }

    return textSet;
}
/// <summary>
/// Creates a <see cref="WebSiteDocument"/> for a stored page file, deriving the
/// page path from the file name.
/// </summary>
/// <param name="fi">File containing the stored page.</param>
/// <param name="webSite">The web site the page belongs to.</param>
/// <param name="options">Format options; <c>lazyLoading</c> defers reading the file content.</param>
/// <returns>New document describing the page.</returns>
private WebSiteDocument LoadWebSiteDocument(FileInfo fi, WebSiteDocuments webSite, WebDomainCategoryFormatOptions options)
{
    String path = GetURLPathFromFilename(fi.Name);

    // Keep only the part after the domain name, e.g. "http://example.com/a/b" -> "/a/b".
    // NOTE(review): assumes the domain always occurs in the decoded path; IndexOf
    // returning -1 would silently produce a wrong substring -- confirm that
    // GetURLPathFromFilename guarantees the domain is present.
    path = path.Substring(path.IndexOf(webSite.domain) + webSite.domain.Length);

    return new WebSiteDocument(path, options.HasFlag(WebDomainCategoryFormatOptions.lazyLoading), fi.FullName);
}
/// <summary>
/// Renders the provided HTML source
/// </summary>
/// <param name="webPage">The web page.</param>
/// <param name="site">The site.</param>
/// <param name="EnableRendering">if set to <c>true</c> [enable rendering]; otherwise every layer is created with empty content.</param>
/// <param name="htmlRelDoc">The HTML relative document; resolved from the document cache when null.</param>
/// <returns>Layer collection with one layer per render instruction, named by the page's AssignedID.</returns>
/// <exception cref="nlpException">Thrown when an instruction requires link selection but the site has no WebGraph.</exception>
public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
{
    TextDocumentLayerCollection output = new TextDocumentLayerCollection();
    output.name = webPage.AssignedID;

    if (htmlRelDoc == null)
    {
        htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
    }

    foreach (DocumentRenderInstruction instruction in instructions)
    {
        String content = "";
        StringBuilder sb = new StringBuilder();

        if (EnableRendering)
        {
            DocumentRenderInstructionFlags flags = instruction.instructionFlags;

            // Content of the current page itself.
            if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
            {
                RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
            }

            // Content derived from the link graph (captions and/or related pages).
            if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
            {
                if (site.extensions.graph == null)
                {
                    throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                }

                imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection = site.extensions.graph.GetLinks(webPage.AssignedID, flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links), flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                {
                    foreach (var link in selection.links)
                    {
                        // NOTE(review): empty labels are appended too (as blank lines) --
                        // the original had an empty isNullOrEmpty() if/else here, so the
                        // unconditional append is preserved; confirm blank lines are wanted.
                        sb.AppendLine(link.linkLabel);
                    }
                }

                if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                {
                    foreach (var node in selection.linkedNodeClones.Values)
                    {
                        WebSiteDocument doc = site.documents.FirstOrDefault(x => x.AssignedID == node.name);
                        RenderDocumentAspect(sb, doc, null, flags, instruction.code);
                    }
                }
            }

            content = sb.ToString();

            if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
            {
                content = content.ToLower();
            }

            if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
            {
                // Keep only the first occurrence of each token, preserving order.
                // HashSet replaces the original List.Contains scan (O(n^2) -> O(n)).
                List<string> tokens = content.getTokens(true, false, true, true, 1);
                HashSet<String> seen = new HashSet<string>();
                StringBuilder sbr = new StringBuilder();

                foreach (String token in tokens)
                {
                    if (seen.Add(token))
                    {
                        sbr.Append(token + " ");
                    }
                }

                content = sbr.ToString();
            }

            content = TrimEmptySpace(content);
        }

        output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
    }

    return output;
}
/// <summary>
/// Loads the web sites.
/// </summary>
/// <param name="category">Category that receives the loaded web sites.</param>
/// <param name="di">Directory containing the stored page files.</param>
/// <param name="options">Format options forwarded to document loading (e.g. lazy loading).</param>
/// <param name="logger">Optional logger.</param>
private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    FileInfo[] fileList = di.GetFiles();
    Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<string, List<FileInfo>>();

    // NOTE(review): a directory with exactly one file is skipped entirely (> 1, not > 0) --
    // confirm this is intentional and not an off-by-one.
    if (fileList.Length > 1)
    {
        // Group the files by the domain name extracted from their URL-derived names.
        foreach (FileInfo fi in fileList)
        {
            String path = GetURLPathFromFilename(fi.Name);

            if (path.StartsWith("http"))
            {
                Match m = SelectDomainName.Match(path);
                if (m.Success)
                {
                    String domain = m.Groups[1].Value;
                    if (!siteFilesIndex.ContainsKey(domain))
                    {
                        siteFilesIndex.Add(domain, new List<FileInfo>());
                    }
                    siteFilesIndex[domain].Add(fi);
                }
            }
        }

        if (logger != null)
        {
            logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
        }

        foreach (String k in siteFilesIndex.Keys)
        {
            WebSiteDocuments webSite = new WebSiteDocuments(k);

            // HashSet replaces the original List.Contains scan (O(n^2) -> O(n))
            // used to detect duplicate page IDs within a site.
            HashSet<String> knownIDs = new HashSet<string>();

            foreach (FileInfo fi in siteFilesIndex[k])
            {
                WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);
                d.AssignedID = WebSiteDocumentsSetTools.GetPageURL(d, webSite);

                // Skip pages whose AssignedID was already seen for this site.
                if (knownIDs.Add(d.AssignedID))
                {
                    webSite.documents.Add(d);
                }
            }

            category.siteDocuments.Add(webSite);

            if (logger != null)
            {
                logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
            }
        }
    }
}
/// <summary>
/// Executes the plane method, invoking contained functions according to the settings
/// </summary>
/// <param name="inputContext">The input context - related to this plane.</param>
/// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
/// <param name="logger">The logger.</param>
/// <returns>
/// Corpus plane context holding the rendered (and optionally filtered and blended) text documents.
/// </returns>
public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
{
    if (notes != null)
    {
        notes.logStartPhase("[1] Entity Plane - execution", "");
    }

    IEntityPlaneContext context = inputContext as IEntityPlaneContext;
    CorpusPlaneContext outputContext = new CorpusPlaneContext();
    outputContext.provider.StoreAndReceive(context);
    outputContext.dataset = context.dataset;

    // ---------------- rendering procedure

    Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>> renderIndex = new Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>>();
    Dictionary<string, SpaceLabel> labels = new Dictionary<string, SpaceLabel>();
    Dictionary<WebSiteDocuments, TextDocumentSet> sitesToRenders = new Dictionary<WebSiteDocuments, TextDocumentSet>();
    Dictionary<String, WebSiteDocuments> inputSites = new Dictionary<string, WebSiteDocuments>();
    Dictionary<String, TextDocumentSet> inputTextRenders = new Dictionary<string, TextDocumentSet>();
    Dictionary<WebSiteDocuments, List<SpaceLabel>> inputSiteVsLabels = new Dictionary<WebSiteDocuments, List<SpaceLabel>>();

    // Loop-invariant: the dataset signature does not change per category,
    // so it is computed once (the original recomputed it inside the loop).
    String datasetSignature = context.dataset.GetDataSetSignature();

    Int32 c = 0;

    // rendering
    foreach (WebSiteDocumentsSet docSet in context.dataset)
    {
        // Resolve (and register) the label for this category. Keeping the resolved
        // instance fixes the original bug where the unnamed/UNKNOWN category was
        // stored under SpaceLabel.UNKNOWN but later looked up via labels[docSet.name],
        // which threw KeyNotFoundException.
        SpaceLabel setLabel;
        if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
        {
            outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
            setLabel = outputContext.space.label_unknown;
            labels[SpaceLabel.UNKNOWN] = setLabel;
        }
        else
        {
            setLabel = new SpaceLabel(docSet.name);
            labels.Add(setLabel.name, setLabel);
            outputContext.space.labels.Add(setLabel);
        }

        // ---- render
        List<TextDocumentSet> textSetForLabel = new List<TextDocumentSet>();

        if (CacheProvider.IsReady)
        {
            // Per-site rendering with cache lookup by (setup, dataset, domain).
            foreach (WebSiteDocuments site in docSet)
            {
                TextDocumentSet tds = CacheProvider.GetCached<TextDocumentSet>(setupSignature, datasetSignature, site.domain);
                if (tds == null)
                {
                    tds = render.RenderSiteDocuments(site, logger);
                    CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                }
                else
                {
                    // Cached instance may carry a stale name -- rebind it to the domain.
                    tds.name = site.domain;
                }
                textSetForLabel.Add(tds);
            }
        }
        else
        {
            textSetForLabel = render.RenderDocumentSet(docSet, logger);
            foreach (TextDocumentSet ws in textSetForLabel)
            {
                CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
            }
        }

        // <--- performs the rendering
        textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));

        // --- rest of indexing
        docSet.ForEach(x => inputSites.Add(x.domain, x));
        renderIndex.Add(docSet, textSetForLabel);

        foreach (WebSiteDocuments site in docSet)
        {
            inputSiteVsLabels.Add(site, new List<SpaceLabel>());
            inputSiteVsLabels[site].Add(setLabel);
            c++;
        }
    }

    if (notes != null)
    {
        notes.log("Text document for [" + c + "] entities created");
    }

    // tmp index: pair each input site with its rendered text set (keys are domain names)
    foreach (String key in inputSites.Keys)
    {
        sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
    }

    // page in site filtering
    if (filter.IsEnabled)
    {
        Dictionary<WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary<WebSiteDocuments, TextDocumentSet>();
        filter.Learn(inputTextRenders.Values);

        foreach (KeyValuePair<WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
        {
            renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
        }

        sitesToRenders = renderIndexFiltered;
    }

    Dictionary<String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary<string, TextDocumentSet>();
    foreach (var pair in sitesToRenders)
    {
        TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
    }

    // blending pages into single page per web site
    Boolean keepSeparated = blender.DoKeepPagesSeparated;

    foreach (var pair in renderIndex)
    {
        foreach (TextDocumentSet entitySet in pair.Value)
        {
            TextDocumentSet selectedTexts = TextDocumentsByDomainName[entitySet.name];
            WebSiteDocuments web = inputSites[entitySet.name];
            IEnumerable<string> label = inputSiteVsLabels[web].Select(x => x.name);

            // NOTE(review): keepSeparated==true produces a SINGLE blended document and
            // keepSeparated==false produces separate ones -- this looks inverted relative
            // to the flag name. Preserved as-is; confirm against DoKeepPagesSeparated semantics.
            if (keepSeparated)
            {
                TextDocument doc = blender.blendToTextDocument(selectedTexts);
                doc.labels.AddRange(label);
                outputContext.corpus_documents.Add(doc);
            }
            else
            {
                var docs = blender.blendToSeparateTextDocuments(selectedTexts);
                foreach (TextDocument doc in docs)
                {
                    doc.labels.AddRange(label);
                    outputContext.corpus_documents.Add(doc);
                }
            }
        }
    }

    if (notes != null)
    {
        notes.logEndPhase();
    }

    return outputContext;
}