/// <summary>
        /// Applies <c>ReduceDocument</c> to every page of the given web site and stores the reduced HTML back into each page.
        /// </summary>
        /// <param name="docSet">The web site whose documents are reduced in place.</param>
        /// <param name="settings">Reduction settings; <c>logSiteLevel</c> decides whether the per-site summary line is logged.</param>
        /// <param name="logger">Logger handed to the per-document reduction and used for the per-site summary.</param>
        /// <returns>Result of <c>GetRatio</c> applied to the total reduced length and the total original length.</returns>
        public Double ReduceDocumentSet(WebSiteDocuments docSet, HtmlDocumentReductionSettings settings, ILogBuilder logger)
        {
            Int32 charsBefore = 0;
            Int32 charsAfter  = 0;

            foreach (WebSiteDocument page in docSet.documents)
            {
                String originalHtml = page.HTMLSource;
                charsBefore += originalHtml.Length;

                String reducedHtml = ReduceDocument(originalHtml, settings, logger);
                charsAfter += reducedHtml.Length;

                // The page keeps only the reduced source from here on.
                page.HTMLSource = reducedHtml;
            }

            Double reduction = charsAfter.GetRatio(charsBefore);

            if (!settings.logSiteLevel)
            {
                return reduction;
            }

            logger.AppendLine("[" + docSet.domain + "] reduced to: " + reduction.ToString("P2"));
            return reduction;
        }
示例#2
0
 /// <summary>
 /// Returns the site entry whose <c>domain</c> equals the given name, creating and
 /// registering a new entry in <c>siteDocuments</c> when no such entry exists.
 /// </summary>
 /// <param name="domainName">Domain name to look up (compared with <c>==</c>).</param>
 /// <returns>The existing entry, or the newly created and added one.</returns>
 public WebSiteDocuments GetOrAdd(String domainName)
 {
     // One pass over the collection instead of Any() followed by a second FirstOrDefault() scan.
     WebSiteDocuments site = siteDocuments.FirstOrDefault(x => x.domain == domainName);

     if (site == null)
     {
         site = new WebSiteDocuments(domainName);
         siteDocuments.Add(site);
     }

     return site;
 }
        /// <summary>
        /// Writes the source of every page of the site into a file resolved through the given folder.
        /// A file that already exists is left untouched.
        /// </summary>
        /// <param name="site">The site whose pages are saved.</param>
        /// <param name="folder">Folder node used to resolve the target file path of each page.</param>
        public void SaveWebSite(WebSiteDocuments site, folderNode folder)
        {
            foreach (WebSiteDocument webPage in site.documents)
            {
                // Double slashes are collapsed before the scheme is prepended, so the "//" of "http://" survives.
                String url      = "http://" + site.domain.add(webPage.path, "/").Replace("//", "/");
                String safeName = WebSiteDocumentsSetTools.GetSafeFilename(GetFilenameFromURLPath(url));

                String description = "Page of [" + site.domain + "] at path [" + webPage.path + "]";
                String targetPath  = folder.pathFor(safeName, imbSCI.Data.enums.getWritableFileMode.existing, description, false);

                // The source is fetched before the existence check, exactly as the file is only written afterwards.
                String html = GetWebDocumentSource(webPage);
                if (File.Exists(targetPath))
                {
                    continue;
                }

                File.WriteAllText(targetPath, html);
            }
        }
        /// <summary>
        /// Renders every page of a web site and collects the results into a single text document set.
        /// </summary>
        /// <param name="site">The site whose documents are rendered.</param>
        /// <param name="logger">The logger (not used inside this method).</param>
        /// <param name="EnableRendering">Forwarded to <c>RenderText</c> for each page.</param>
        /// <returns>A set constructed with the site's domain, holding one layer collection per page.</returns>
        public TextDocumentSet RenderSiteDocuments(WebSiteDocuments site, ILogBuilder logger, Boolean EnableRendering = true)
        {
            var renderedSite = new TextDocumentSet(site.domain);

            // Pages are rendered one after another, in the order of site.documents.
            foreach (WebSiteDocument page in site.documents)
            {
                TextDocumentLayerCollection layers = RenderText(page, site, EnableRendering);
                layers.name = page.AssignedID;

                renderedSite.Add(layers);
            }

            return renderedSite;
        }
        /// <summary>
        /// Creates a <see cref="WebSiteDocument"/> for a file, using the part of the file's URL path
        /// that follows the web site's domain as the document path.
        /// </summary>
        /// <param name="fi">The file; its name is converted to a URL path via <c>GetURLPathFromFilename</c>.</param>
        /// <param name="webSite">The web site; its <c>domain</c> is stripped from the start of the path.</param>
        /// <param name="options">Format options; the <c>lazyLoading</c> flag is passed to the document constructor.</param>
        /// <returns>The newly created document.</returns>
        private WebSiteDocument LoadWebSiteDocument(FileInfo fi, WebSiteDocuments webSite, WebDomainCategoryFormatOptions options)
        {
            String path = GetURLPathFromFilename(fi.Name);

            // URLs are not linguistic text, so the domain is searched ordinally.
            Int32 domainStart = path.IndexOf(webSite.domain, StringComparison.Ordinal);

            if (domainStart >= 0)
            {
                // Keep only what follows the domain.
                path = path.Substring(domainStart + webSite.domain.Length);
            }
            // When the domain is not found the path is kept as is; the unchecked -1 would
            // otherwise shift the Substring offset and yield a garbage path or an exception.

            Boolean lazyLoading = options.HasFlag(WebDomainCategoryFormatOptions.lazyLoading);

            return new WebSiteDocument(path, lazyLoading, fi.FullName);
        }
        /// <summary>
        /// Renders a web page into a collection of text layers, one layer per entry in <c>instructions</c>.
        /// </summary>
        /// <param name="webPage">The web page to render.</param>
        /// <param name="site">The site the page belongs to; its link graph is required by instructions with the <c>select_links</c> flag.</param>
        /// <param name="EnableRendering">When <c>false</c>, every layer is created with empty content.</param>
        /// <param name="htmlRelDoc">Parsed HTML document of the page; when <c>null</c> it is obtained from the default document cache.</param>
        /// <returns>Layer collection named after the page's <c>AssignedID</c>.</returns>
        /// <exception cref="nlpException">An instruction selects links but the site has no graph.</exception>
        public TextDocumentLayerCollection RenderText(WebSiteDocument webPage, WebSiteDocuments site, Boolean EnableRendering = true, HtmlDocument htmlRelDoc = null)
        {
            TextDocumentLayerCollection output = new TextDocumentLayerCollection();

            output.name = webPage.AssignedID;

            if (htmlRelDoc == null)
            {
                htmlRelDoc = HtmlDocumentCache.DefaultDocumentCache.GetDocument(webPage.AssignedID, webPage.HTMLSource);
            }

            foreach (DocumentRenderInstruction instruction in instructions)
            {
                String content = "";

                if (EnableRendering)
                {
                    StringBuilder sb = new StringBuilder();
                    DocumentRenderInstructionFlags flags = instruction.instructionFlags;

                    if (flags.HasFlag(DocumentRenderInstructionFlags.cur_page))
                    {
                        RenderDocumentAspect(sb, webPage, htmlRelDoc, flags, instruction.code);
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.select_links))
                    {
                        if (site.extensions.graph == null)
                        {
                            throw new nlpException("WebGraph is null - can't render instruction [" + instruction.name + "]", "Graph is null for site [" + site.domain + "]");
                        }

                        imbSCI.Graph.FreeGraph.freeGraphNodeAndLinks selection =
                            site.extensions.graph.GetLinks(webPage.AssignedID,
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_outbound_links),
                                                           flags.HasFlag(DocumentRenderInstructionFlags.select_inbound_links));

                        // The flag does not change per link, so it is tested once instead of on every iteration.
                        if (flags.HasFlag(DocumentRenderInstructionFlags.link_caption))
                        {
                            foreach (var link in selection.links)
                            {
                                // A line is appended for every link, including links with a null or empty label.
                                sb.AppendLine(link.linkLabel);
                            }
                        }

                        if (flags.HasFlag(DocumentRenderInstructionFlags.select_rel_page))
                        {
                            foreach (var node in selection.linkedNodeClones.Values)
                            {
                                // NOTE(review): doc is null when no page of the site has this AssignedID;
                                // it is passed on unchanged - confirm RenderDocumentAspect accepts null.
                                WebSiteDocument doc = site.documents.FirstOrDefault(x => x.AssignedID == node.name);
                                RenderDocumentAspect(sb, doc, null, flags, instruction.code);
                            }
                        }
                    }

                    content = sb.ToString();

                    if (flags.HasFlag(DocumentRenderInstructionFlags.lower_case))
                    {
                        content = content.ToLower();
                    }

                    if (flags.HasFlag(DocumentRenderInstructionFlags.unique_tokens))
                    {
                        List<string> tokens = content.getTokens(true, false, true, true, 1);

                        // Keep the first occurrence of each token, in original order; the set makes the
                        // membership test constant-time instead of a scan of the tokens seen so far.
                        HashSet<string> seen         = new HashSet<string>();
                        StringBuilder   uniqueTokens = new StringBuilder();

                        foreach (String token in tokens)
                        {
                            if (seen.Add(token))
                            {
                                uniqueTokens.Append(token).Append(' ');
                            }
                        }

                        content = uniqueTokens.ToString();
                    }

                    content = TrimEmptySpace(content);
                }

                output.CreateLayer(instruction.name, content, Convert.ToInt32(instruction.weight));
            }

            return output;
        }
        /// <summary>
        /// Loads the web sites found in a directory: files are grouped by the domain name extracted
        /// from their URL path, and every group becomes one <see cref="WebSiteDocuments"/> entry
        /// added to the category.
        /// </summary>
        /// <param name="category">The category that receives the loaded web sites.</param>
        /// <param name="di">The directory whose files are scanned.</param>
        /// <param name="options">Format options forwarded to <c>LoadWebSiteDocument</c>.</param>
        /// <param name="logger">Optional logger; nothing is logged when <c>null</c>.</param>
        private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
        {
            FileInfo[] fileList = di.GetFiles();

            // NOTE(review): a directory holding exactly one file is skipped by this condition - confirm that is intended.
            if (fileList.Length <= 1)
            {
                return;
            }

            Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<string, List<FileInfo>>();

            foreach (FileInfo fi in fileList)
            {
                String path = GetURLPathFromFilename(fi.Name);

                // URL prefix check - ordinal, not linguistic.
                if (!path.StartsWith("http", StringComparison.Ordinal))
                {
                    continue;
                }

                Match m = SelectDomainName.Match(path);
                if (!m.Success)
                {
                    continue;
                }

                String domain = m.Groups[1].Value;

                // Single dictionary lookup per file.
                List<FileInfo> siteFiles;
                if (!siteFilesIndex.TryGetValue(domain, out siteFiles))
                {
                    siteFiles = new List<FileInfo>();
                    siteFilesIndex.Add(domain, siteFiles);
                }

                siteFiles.Add(fi);
            }

            if (logger != null)
            {
                logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
            }

            foreach (KeyValuePair<String, List<FileInfo>> siteEntry in siteFilesIndex)
            {
                WebSiteDocuments webSite = new WebSiteDocuments(siteEntry.Key);

                // IDs already taken within this site; a set keeps the duplicate check constant-time.
                HashSet<String> assignedIDs = new HashSet<String>();

                foreach (FileInfo fi in siteEntry.Value)
                {
                    WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);

                    d.AssignedID = WebSiteDocumentsSetTools.GetPageURL(d, webSite);

                    // Only the first document with a given ID is kept.
                    if (assignedIDs.Add(d.AssignedID))
                    {
                        webSite.documents.Add(d);
                    }
                }

                category.siteDocuments.Add(webSite);

                if (logger != null)
                {
                    logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
                }
            }
        }
示例#8
0
        /// <summary>
        /// Executes the plane method: renders the web sites of the input dataset into text document sets
        /// (through the cache when it is ready), optionally filters the rendered pages, blends them into
        /// text documents and returns them, labeled, in a new corpus context.
        /// </summary>
        /// <param name="inputContext">The input context - related to this plane; must be an <c>IEntityPlaneContext</c>.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// A new <c>CorpusPlaneContext</c> whose <c>corpus_documents</c> hold the blended, labeled text documents.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="inputContext"/> is null or is not an <c>IEntityPlaneContext</c>.</exception>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            if (notes != null)
            {
                notes.logStartPhase("[1] Entity Plane - execution", "");
            }

            IEntityPlaneContext context = inputContext as IEntityPlaneContext;
            if (context == null)
            {
                // Fail with a clear message instead of a NullReferenceException on the first dereference below.
                throw new ArgumentException("Input context must be a non-null IEntityPlaneContext", "inputContext");
            }

            CorpusPlaneContext outputContext = new CorpusPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            outputContext.dataset = context.dataset;

            // ---------------- rendering procedure
            Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>> renderIndex = new Dictionary<WebSiteDocumentsSet, List<TextDocumentSet>>();
            Dictionary<string, SpaceLabel> labels = new Dictionary<string, SpaceLabel>();

            Dictionary<WebSiteDocuments, TextDocumentSet>  sitesToRenders    = new Dictionary<WebSiteDocuments, TextDocumentSet>();
            Dictionary<String, WebSiteDocuments>           inputSites        = new Dictionary<string, WebSiteDocuments>();
            Dictionary<String, TextDocumentSet>            inputTextRenders  = new Dictionary<string, TextDocumentSet>();
            Dictionary<WebSiteDocuments, List<SpaceLabel>> inputSiteVsLabels = new Dictionary<WebSiteDocuments, List<SpaceLabel>>();

            Int32 c = 0;

            // rendering
            foreach (WebSiteDocumentsSet docSet in context.dataset)
            {
                // Label of this document set; kept in a local so the per-site indexing below does not
                // depend on looking it up again by docSet.name.
                SpaceLabel setLabel;

                if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
                {
                    outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
                    setLabel = outputContext.space.label_unknown;
                    labels.Add(SpaceLabel.UNKNOWN, setLabel);
                }
                else
                {
                    setLabel = new SpaceLabel(docSet.name);
                    labels.Add(setLabel.name, setLabel);
                    outputContext.space.labels.Add(setLabel);
                }

                String datasetSignature = context.dataset.GetDataSetSignature();

                // ---- render
                List<TextDocumentSet> textSetForLabel = new List<TextDocumentSet>();

                if (CacheProvider.IsReady)
                {
                    foreach (WebSiteDocuments site in docSet)
                    {
                        TextDocumentSet tds = CacheProvider.GetCached<TextDocumentSet>(setupSignature, datasetSignature, site.domain);

                        if (tds == null)
                        {
                            // Cache miss: render the site and store the result.
                            tds = render.RenderSiteDocuments(site, logger);
                            CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                        }
                        else
                        {
                            tds.name = site.domain;
                        }

                        textSetForLabel.Add(tds);
                    }
                }
                else
                {
                    // NOTE(review): SetCached is called although the provider reported not ready - confirm this is intended.
                    textSetForLabel = render.RenderDocumentSet(docSet, logger);
                    foreach (TextDocumentSet ws in textSetForLabel)
                    {
                        CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
                    }
                }

                // index the renders by name
                textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));

                // --- rest of indexing
                docSet.ForEach(x => inputSites.Add(x.domain, x));
                renderIndex.Add(docSet, textSetForLabel);

                foreach (WebSiteDocuments site in docSet)
                {
                    inputSiteVsLabels.Add(site, new List<SpaceLabel>());

                    // A set with a null/empty name is registered under SpaceLabel.UNKNOWN, so a lookup
                    // by docSet.name would throw for it; the label resolved above is used directly.
                    inputSiteVsLabels[site].Add(setLabel);
                    c++;
                }
            }

            if (notes != null)
            {
                notes.log("Text document for [" + c + "] entities created");
            }

            // tmp index: pair every input site with the render of the same key
            foreach (String key in inputSites.Keys)
            {
                sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
            }

            // page in site filtering
            if (filter.IsEnabled)
            {
                Dictionary<WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary<WebSiteDocuments, TextDocumentSet>();

                filter.Learn(inputTextRenders.Values);

                foreach (KeyValuePair<WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
                {
                    renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
                }
                sitesToRenders = renderIndexFiltered;
            }

            Dictionary<String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary<string, TextDocumentSet>();

            foreach (var pair in sitesToRenders)
            {
                TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
            }

            // blending pages of a web site into text documents
            Boolean keepSeparated = blender.DoKeepPagesSeparated;

            foreach (var pair in renderIndex)
            {
                foreach (TextDocumentSet entitySet in pair.Value)
                {
                    TextDocumentSet     selectedTexts = TextDocumentsByDomainName[entitySet.name];
                    WebSiteDocuments    web           = inputSites[entitySet.name];
                    IEnumerable<string> label         = inputSiteVsLabels[web].Select(x => x.name);

                    // NOTE(review): going by the names alone, the two branches look swapped (keepSeparated
                    // produces a single document, the other branch produces separate ones) - confirm against
                    // the definition of DoKeepPagesSeparated before changing; behavior is left as it was.
                    if (keepSeparated)
                    {
                        TextDocument doc = blender.blendToTextDocument(selectedTexts);
                        doc.labels.AddRange(label);
                        outputContext.corpus_documents.Add(doc);
                    }
                    else
                    {
                        var docs = blender.blendToSeparateTextDocuments(selectedTexts);
                        foreach (TextDocument doc in docs)
                        {
                            doc.labels.AddRange(label);
                            outputContext.corpus_documents.Add(doc);
                        }
                    }
                }
            }

            if (notes != null)
            {
                notes.logEndPhase();
            }

            return outputContext;
        }