Beispiel #1
0
        //public String SetPageTFIDF(webPageTF wpTF, indexDomain idomain)
        //{
        //    String hash = idomain.HashCode + "-" + wpTF.ipage.HashCode;
        //    var dt = wpTF.GetDataTable(wpTF.ipage.url);
        //    String path = imbWEMManager.index.folder.pathFor(hash + ".xml");

        //    objectSerialization.saveObjectToXML<webPageTF>(path, wpTF);
        //    dt.Save(imbWEMManager.index.experimentEntry.crawlRecordFolder, imbWEMManager.authorNotation, hash);

        //    wpTF.ipage.TFIDFcompiled = true;

        //    wpTF.ipage.Lemmas = wpTF.Count();


        //    AddOrUpdate(wpTF.ipage);

        //    dt.GetReportAndSave(imbWEMManager.index.experimentEntry.crawlRecordFolder, imbWEMManager.authorNotation, hash, true);
        //    return path;
        //}


        /// <summary>
        /// Returns the pages stored for the specified domain, keeping only the first page found for each distinct URL.
        /// </summary>
        /// <param name="domainName">Domain name or URL; it is passed through <see cref="domainAnalysis"/> and the resulting <c>domainName</c> is used for the lookup.</param>
        /// <returns>Pages of the domain in the order they were selected, without repeated URLs.</returns>
        public List <indexPage> GetPagesForDomain(string domainName)
        {
            domainAnalysis da = new domainAnalysis(domainName);

            // NOTE(review): the domain name is concatenated into the filter expression unescaped -- confirm that
            // tableSelect only ever receives trusted input, or escape single quotes here.
            var rows  = tableSelect("domain = '" + da.domainName + "'");
            var pages = GetObjectFromRows(rows);

            List <indexPage>  output   = new List <indexPage>();
            HashSet <string>  seenUrls = new HashSet <string>();

            foreach (indexPage page in pages)
            {
                // HashSet.Add returns false when the URL was already seen, so the membership test is O(1)
                // instead of scanning a list for every page.
                if (seenUrls.Add(page.url))
                {
                    output.Add(page);
                }
            }
            return(output);
        }
Beispiel #2
0
        /// <summary>
        /// Creates the site record and prepares the domain information and the iteration performance table.
        /// </summary>
        /// <param name="__testRunStamp">Test run stamp, forwarded to the base constructor.</param>
        /// <param name="__instance">Spider web instance, forwarded to the base constructor.</param>
        public modelSpiderSiteRecord(string __testRunStamp, spiderWeb __instance) : base(__testRunStamp, __instance)
        {
            // The seed link is read from the "instance" member, not from the __instance argument
            var seedUrl = instance.seedLink.url;
            domainInfo = new domainAnalysis(seedUrl);

            // Table name is built from the domain name, with dots replaced by underscores
            var tableName = "iteration_" + domainInfo.domainName.Replace(".", "_");
            iterationTableRecord = new objectTable <iterationPerformanceRecord>("key", tableName);
        }
 /// <summary>
 /// Creates a profile whose url, domain and name are taken from the <see cref="domainAnalysis"/> of the given URL.
 /// </summary>
 /// <param name="_url">URL of the web site.</param>
 public webSiteProfile(String _url)
 {
     domainAnalysis analysis = new domainAnalysis(_url);

     domainInfo = analysis;
     name       = analysis.domainRootName;
     domain     = analysis.domainName;
     url        = analysis.urlProper;
 }
Beispiel #4
0
        /// <summary>
        /// Adds domains from the given list, optionally restricted to a window of the list.
        /// </summary>
        /// <param name="__indexDomains">Domains to add.</param>
        /// <param name="start">Zero-based position of the first entry to consider.</param>
        /// <param name="take">Number of entries to consider; -1 means all entries from <paramref name="start"/> to the end of the list.</param>
        /// <returns>Number of domains for which the URL-based <c>Add</c> call returned true.</returns>
        public int Add(List <indexDomain> __indexDomains, int start = 0, int take = -1)
        {
            int windowSize = (take == -1) ? (__indexDomains.Count - start) : take;
            int windowEnd  = start + windowSize;

            int added    = 0;
            int position = 0;

            foreach (indexDomain domain in __indexDomains)
            {
                bool inWindow = (position >= start) && (position < windowEnd);
                position++;

                if (!inWindow)
                {
                    continue;
                }

                var properUrl = new domainAnalysis(domain.url).urlProper;
                if (Add(properUrl))
                {
                    // The domain object is kept only when its URL was accepted
                    indexDomains.Add(domain);
                    added++;
                }
            }
            return(added);
        }
        /// <summary>
        /// Attaches the given site entry and sets the name from the domain information.
        /// </summary>
        /// <param name="__entry">Site entry to attach; its <c>domainProperUrl</c> is used when no domain information was assigned before.</param>
        internal void deploy(imbMCWebSiteEntry __entry)
        {
            entry = __entry;

            // Domain information that is already assigned is kept; otherwise it is built from the entry's proper URL
            if (domainInfo == null)
            {
                domainInfo = new domainAnalysis(entry.domainProperUrl);
            }

            name = domainInfo.domainRootName;
        }
        /// <summary>
        /// Builds a case name from a URL: the domain name with dots replaced by underscores.
        /// </summary>
        /// <param name="properUrl">URL, or a name that contains none of ".", "/" and ":".</param>
        /// <returns>The input unchanged when it contains none of the marker characters; otherwise the underscored domain name.</returns>
        public static String GetCleanCaseName(String properUrl)
        {
            String[] urlMarkers = new String[] { ".", "/", ":" };

            // Nothing that looks like a URL part: the input is used as it is
            if (!properUrl.ContainsAny(urlMarkers))
            {
                return(properUrl);
            }

            var domainName = new domainAnalysis(properUrl).domainName;
            return(domainName.Replace(".", "_"));
        }
        /// <summary>
        /// Customized code to be executed once the console is started
        /// </summary>
        public override void onStartUp()
        {
            base.onStartUp();

            // System knowledge is prepared against a fresh database target, reporting to the console output
            imbACE.Network.tools.systemKnowledge.prepare(new dataBaseTarget(), output);

            // The instance is not used afterwards.
            // NOTE(review): presumably a warm-up / smoke check of domainAnalysis -- confirm it is still needed.
            new domainAnalysis("http://www.koplas.co.rs");

            // put here your code
        }
 /// <summary>
 /// Loads the domain list from a text file with one entry per line, replacing the current content of the site list.
 /// </summary>
 /// <param name="path">Path of the text file to read. When the file does not exist the site list is left untouched.</param>
 /// <param name="options">Format options; only <see cref="WebDomainCategoryFormatOptions.normalizeDomainname"/> is consulted by this method.</param>
 /// <param name="logger">Optional log; receives a message when the file is not found.</param>
 public void LoadDomainList(String path, WebDomainCategoryFormatOptions options = WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate | WebDomainCategoryFormatOptions.normalizeDomainname, ILogBuilder logger = null)
 {
     if (!File.Exists(path))
     {
         // A missing file used to be ignored without any trace; it is now reported when a log is supplied
         if (logger != null)
         {
             logger.log("Domain list file not found at [" + path + "]");
         }
         return;
     }

     // The flag does not change during the load, so it is evaluated once
     Boolean normalize = options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname);

     sites.Clear();
     foreach (String ln in File.ReadAllLines(path))
     {
         // With normalization on, each line is replaced by the proper URL that domainAnalysis derives from it
         sites.Add(normalize ? new domainAnalysis(ln).urlProper : ln);
     }
 }
        /// <summary>
        /// Renders the site list as text, one entry per line.
        /// </summary>
        /// <param name="options">Format options; with <see cref="WebDomainCategoryFormatOptions.normalizeDomainname"/> set, each entry is written as the proper URL derived by <see cref="domainAnalysis"/>.</param>
        /// <param name="logger">Not used by this method.</param>
        /// <returns>All entries, each followed by a line terminator.</returns>
        public String GetDomainList(WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
        {
            Boolean       normalize   = options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname);
            StringBuilder listBuilder = new StringBuilder();

            foreach (String site in sites)
            {
                if (normalize)
                {
                    listBuilder.AppendLine(new domainAnalysis(site).urlProper);
                }
                else
                {
                    listBuilder.AppendLine(site);
                }
            }

            return(listBuilder.ToString());
        }
Beispiel #10
0
        /// <summary>
        /// Takes over descriptive data from the node/crawled page object -- called from the tokenizer; it does not transfer the content itself
        /// </summary>
        /// <param name="sourcePage">Source page; when null the call does nothing.</param>
        public void acceptSourcePage(node sourcePage)
        {
            if (sourcePage == null)
            {
                return;
            }

            contentUrl   = sourcePage.url;
            contentTitle = sourcePage.caption.htmlContentProcess();

            // NOTE(review): head tokens are produced only when domainInfo is NOT null/empty beforehand, and it is
            // then rebuilt from the page URL -- the condition looks inverted; confirm the intent before changing it.
            if (domainInfo.isNullOrEmptyString())
            {
                return;
            }

            domainInfo = new domainAnalysis(contentUrl);

            // One head token per word of the domain name
            foreach (string domainWord in domainInfo.domainWords)
            {
                headTokens.Add(new contentToken
                {
                    content       = domainWord,
                    sourceContent = domainInfo.domainRootName,
                    origin        = contentTokenOrigin.domain
                });
            }

            // One head token per lower-cased word matched in the page title
            MatchCollection titleMatches = imbStringSelect._select_wordsFromDomainname.Matches(contentTitle);
            foreach (Match titleMatch in titleMatches)
            {
                headTokens.Add(new contentToken
                {
                    content       = titleMatch.Value.ToLower(),
                    sourceContent = contentTitle,
                    origin        = contentTokenOrigin.title
                });
            }
        }
        /// <summary>
        /// Builds or updates the web site repository from crawl results.
        /// </summary>
        /// <param name="targetCollection">Collection of spider targets; only the targets returned by <c>GetLoaded()</c> are considered.</param>
        /// <param name="domainInfo">Domain information used to obtain (or create) the repository.</param>
        /// <param name="output">Log passed on to the repository load and save calls.</param>
        /// <returns>
        /// Reference to the created or updated web site repository
        /// </returns>
        /// <remarks>
        /// A page is built for every loaded target accepted by <c>isTargetProper</c>; afterwards the site entry is
        /// stored in the site table and the repository is saved. Progress messages go to the <c>loger</c> member,
        /// not to <paramref name="output"/>.
        /// </remarks>
        public imbMCWebSite BuildWebSite(ISpiderTargetCollection targetCollection, domainAnalysis domainInfo, ILogBuilder output = null)
        {
            imbMCWebSite repo = GetWebSite(domainInfo, true, output);

            // Page count before the build; zero is reported as a newly created repository
            int pagesBefore = repo.pageTable.Count;
            if (pagesBefore == 0)
            {
                loger.log("Web site repository created [" + domainInfo.domainName + "]");
            }

            foreach (ISpiderTarget target in targetCollection.GetLoaded())
            {
                if (!isTargetProper(target))
                {
                    continue;
                }
                BuildWebPage(target, repo);
            }

            int pagesAdded = repo.pageTable.Count - pagesBefore;
            if (pagesAdded > 0)
            {
                loger.log("Repository [" + domainInfo.domainName + "] expanded for [" + pagesAdded + "] new pages, in total [" + (pagesBefore + pagesAdded) + "] pages.");
            }

            siteTable.AddOrUpdate(repo.entry);
            repo.SaveDataStructure(folder, output);

            return(repo);
        }
        /// <summary>
        /// Gets the web site repository for the given domain; the site table is keyed by the domain root name.
        /// </summary>
        /// <param name="domainInfo">The domain information.</param>
        /// <param name="autoCreate">if set to <c>true</c> the entry is obtained through <c>GetOrCreate</c> even when the site table does not contain it yet</param>
        /// <param name="output">The log/diagnostic output; <c>aceLog.loger</c> is used when null</param>
        /// <returns>The repository, or null when the site is not in the table and <paramref name="autoCreate"/> is false.</returns>
        public imbMCWebSite GetWebSite(domainAnalysis domainInfo, bool autoCreate = false, ILogBuilder output = null)
        {
            var siteKey = domainInfo.domainRootName;

            bool isAvailable = autoCreate || siteTable.ContainsKey(siteKey);
            if (!isAvailable)
            {
                return(null);
            }

            output = output ?? aceLog.loger;

            imbMCWebSiteEntry entry = siteTable.GetOrCreate(siteKey);
            entry.domainProperUrl = domainInfo.urlProper;

            imbMCWebSite repo = entry.domain.LoadDataStructure <imbMCWebSite>(folder, output);

            // domainInfo is assigned before deploy(), so deploy() does not rebuild it from the entry
            repo.domainInfo = domainInfo;
            repo.deploy(entry);

            entry.domainProperUrl = repo.domainInfo.urlProper;
            siteTable.AddOrUpdate(entry);

            if ((repo.folder == null) && (output != null))
            {
                output.log("Warning: folder instance is null in web site repo [" + repo.name + "]");
            }

            return(repo);
        }
Beispiel #13
0
        /// <summary>
        /// Adds any new domains from the source, optionally restricted to a window of its domain list.
        /// </summary>
        /// <param name="source">The source.</param>
        /// <param name="start">Zero-based position of the first domain to consider.</param>
        /// <param name="take">Number of domains to consider; -1 means all from <paramref name="start"/> up to <c>source.Count</c>.</param>
        /// <returns>Number of domains for which the URL-based <c>Add</c> call returned true.</returns>
        public int Add(webSiteSimpleSample source, int start = 0, int take = -1)
        {
            int windowSize = (take == -1) ? (source.Count - start) : take;
            int windowEnd  = start + windowSize;

            int added    = 0;
            int position = 0;

            foreach (string domain in source.domains)
            {
                bool inWindow = (position >= start) && (position < windowEnd);
                position++;

                if (!inWindow)
                {
                    continue;
                }

                // Domains are added by their proper URL, as derived by domainAnalysis
                if (Add(new domainAnalysis(domain).urlProper))
                {
                    added++;
                }
            }
            return(added);
        }
Beispiel #14
0
        /// <summary>
        /// Creates the test application, starts a worker thread, prepares the system knowledge and asserts that
        /// <see cref="domainAnalysis"/> resolves a TLD definition for a sample URL.
        /// </summary>
        public void StartApp()
        {
            ILogBuilder logger = new  builderForLogBase();

            app = new testApplication();


            // NOTE(review): newThread is defined outside this block -- presumably it runs the application; confirm
            Thread t = new Thread(newThread);

            t.Start();


            // Fixed two second pause before continuing; presumably gives the worker thread time to start -- TODO confirm
            Thread.Sleep(2000);


            dataBaseTarget dBT = new dataBaseTarget();

            imbACE.Network.tools.systemKnowledge.prepare(dBT, logger);


            domainAnalysis da = new domainAnalysis("http://www.koplas.co.rs");

            // The check itself: a TLD definition must be available for the sample URL
            Assert.IsNotNull(da.tldDefinition);
        }
Beispiel #15
0
        /// <summary>
        /// Executes a maintenance operation over the page index table.
        /// </summary>
        /// <param name="operation">The operation to execute; operations not handled below do nothing.</param>
        /// <param name="sourceFile">The source file; used only by <see cref="indexPageTableOperation.loadReviewedTable"/>.</param>
        /// <param name="domainsString">Domain names or URLs of the sample; used only by <see cref="indexPageTableOperation.flushNotInSample"/>. Null is treated as an empty list.</param>
        /// <param name="loger">Optional log for progress messages; when null no progress is reported.</param>
        public void ExecuteIndexPageOperation(indexPageTableOperation operation, string sourceFile, List <string> domainsString = null, ILogBuilder loger = null)
        {
            // Reduce the supplied entries to the domain names derived by domainAnalysis
            List <string> domainNames = new List <string>();
            if (domainsString != null)
            {
                foreach (string dm in domainsString)
                {
                    domainNames.Add(new domainAnalysis(dm).domainName);
                }
            }

            // Progress counters: i = items since the last progress message, c = items done in the current pass,
            // imax = items between two progress messages. They are reset at the start of every pass, so the
            // reported percentage always refers to the pass being logged.
            int i;
            int c;
            int imax;

            switch (operation)
            {
            case indexPageTableOperation.flushNotInSample:

                // Pass 1: collect the pages of all sample domains
                List <indexPage> newPages = new List <indexPage>();

                i    = 0;
                c    = 0;
                imax = domainNames.Count / 10;

                foreach (string dm in domainNames)
                {
                    newPages.AddRange((IEnumerable <indexPage>)pageIndexTable.GetPagesForDomain(dm));

                    if (i > imax)
                    {
                        i = 0;
                        if (loger != null)
                        {
                            loger.log("Selecting pages [" + ((double)c / (double)domainNames.Count).ToString("P2") + "]");
                        }
                    }
                    i++;
                    c++;
                }

                pageIndexTable.Clear();
                Save();

                // Pass 2: put the collected pages back into the emptied table
                i    = 0;
                c    = 0;
                imax = newPages.Count / 20;

                foreach (indexPage page in newPages)
                {
                    pageIndexTable.Add(page);

                    if (i > imax)
                    {
                        i = 0;
                        if (loger != null)
                        {
                            loger.log("Adding pages [" + ((double)c / (double)newPages.Count).ToString("P2") + "]");
                        }
                    }
                    i++;
                    c++;
                }

                domainIndexTable.Clear();

                Save();
                Recheck(loger);

                Publish(new aceAuthorNotation());

                break;

            case indexPageTableOperation.flushNotLoaded:

                // NOTE(review): pages are removed from pageIndexTable while the list returned by GetList() is
                // enumerated -- this assumes GetList() returns a separate list; confirm.
                List <indexPage> pages = pageIndexTable.GetList();

                i    = 0;
                c    = 0;
                imax = pages.Count / 20; // computed after the pages are loaded, not from an empty list

                foreach (indexPage page in pages)
                {
                    if (page.byteSize == 0)
                    {
                        pageIndexTable.Remove(page);
                    }

                    if (i > imax)
                    {
                        i = 0;
                        if (loger != null)
                        {
                            loger.log("Removing pages from index [" + ((double)c / (double)pages.Count).ToString("P2") + "]");
                        }
                    }
                    i++;
                    c++;
                }
                break;

            case indexPageTableOperation.loadReviewedTable:
                ApplyManualPageIndex(sourceFile, loger, true);
                break;

            default:
                break;
            }
        }
Beispiel #16
0
        /// <summary>
        /// Closes the session: optionally stores the queried URLs that were not in the index, updates the index
        /// session entry, and finally closes the experiment session.
        /// </summary>
        /// <param name="tRecords">Test records, passed on to <c>experimentManager.CloseSession</c>.</param>
        public void CloseSession(IEnumerable <modelSpiderTestRecord> tRecords)
        {
            if (imbWEMManager.settings.indexEngine.doSaveFailedURLQueries)
            {
                pageIndexTable.ReadOnlyMode = false;

                int total       = pageIndexTable.urlsNotInIndex.Count;
                int reportEvery = total / 10;
                int sinceReport = 0;
                int processed   = 0;

                aceLog.log("Deploying queried URLs that were not in the index (" + total.ToString() + ")");

                foreach (string url in pageIndexTable.urlsNotInIndex)
                {
                    sinceReport++;
                    processed++;

                    indexPage page = pageIndexTable.GetPageForUrl(url);
                    page.url    = url;
                    page.domain = new domainAnalysis(url).domainName;

                    pageIndexTable.AddOrUpdate(page);

                    // Progress message roughly every tenth of the URLs
                    if (sinceReport >= reportEvery)
                    {
                        aceLog.log("URL processed: " + processed.GetRatio(total).ToString("P2") + " (" + processed + ")");
                        sinceReport = 0;
                    }
                }
            }

            if (indexSessionEntry != null)
            {
                aceLog.log("Saving index engine performance : ... ");

                if (!SKIP_INDEXUPDATE)
                {
                    var assertion = imbWEMManager.index.domainIndexTable.GetDomainIndexAssertion(null, true);

                    aceLog.log("Saving index engine performance : DomainAssetion done ");

                    var sessionState = experimentManager.CurrentSession.state;

                    indexSessionEntry.Domains         = domainIndexTable.Count;
                    indexSessionEntry.Pages           = pageIndexTable.Count;
                    indexSessionEntry.PagesEvaluated  = pageIndexTable.Where(p => !collectionExtensions.isNullOrEmpty(p.relevancyText)).Count();
                    indexSessionEntry.CrawlerHash     = sessionState.setupHash_crawler;
                    indexSessionEntry.GlobalSetupHash = sessionState.setupHash_global;
                    indexSessionEntry.Duration        = DateTime.Now.Subtract(indexSessionEntry.Start).TotalMinutes;

                    aceLog.log("Saving index engine performance : PagesEvaluated counted ");

                    indexSessionEntry.CertainityPP        = assertion.certainty;
                    indexSessionEntry.MasterTFIDFCoverage = assertion.masterTFIDFApplied;
                    indexSessionEntry.DomainTFIDFs        = assertion[indexDomainContentEnum.completeDomainTFIDF].Count;
                }

                aceLog.log("Saving index engine performance : Saving index ");

                if (imbWEMManager.settings.directReportEngine.doPublishIndexPerformanceTable)
                {
                    indexSessionRecords.AddOrUpdate(indexSessionEntry);
                }
            }

            experimentManager.CloseSession(tRecords);
        }
Beispiel #17
0
        /// <summary>
        /// Imports sample from text file
        /// </summary>
        /// <param name="path">path to file with samples, if * it will open dialog to select the file</param>
        /// <param name="inWorkspace">if true, the file path is interpreted as relative to console workspace; ignored for rooted paths and for a file picked in the dialog</param>
        /// <param name="sampleName">Name of the sample list, if empty it will not change current sample list name</param>
        /// <param name="replace">if set to true it will replace any existing samples in the list</param>
        /// <param name="skip">Number of entries to skip from the start of the imported file; negative values are treated as 0</param>
        /// <param name="limit">Maximum number of entries imported after the skipped ones; -1 applies the default cap of 10000</param>
        /// <param name="debug">if true it will report on link preprocessing</param>
        /// <remarks>
        /// Loads the file and adds domain urls from it into context's sample list. When the file does not exist
        /// a message is logged and nothing is changed.
        /// </remarks>
        /// <seealso cref="aceOperationSetExecutorBase" />
        public void aceOperation_addSampleFile(
            [Description("path to file with samples, if * it will open dialog to select the file")] String path              = "*",
            [Description("if true, the file path is interpreted as relative to console workspace")] Boolean inWorkspace      = true,
            [Description("Name of the sample list, if empty it will not change current sample list name")] String sampleName = "",
            [Description("if set to true it will replace any existing samples in the list")] Boolean replace = false,
            [Description("Number of entries to skip, from the imported file")] Int32 skip = 0,
            [Description("If set above 0, it limits the total number of domains imported")] Int32 limit = -1,
            [Description("if true it will report on link preprocessing")] Boolean debug = true)
        {
            IAceAdvancedConsole console = parent as IAceAdvancedConsole;

            if (path == "*")
            {
                // The dialog opens in the workspace folder when one is available, otherwise in the projects folder
                String defPath = appManager.Application.folder_projects.path;
                if (inWorkspace)
                {
                    if (console != null)
                    {
                        defPath = console.workspace.folder.path;
                    }
                }
                path        = dialogs.openSelectFile(imbACE.Services.textBlocks.smart.dialogSelectFileMode.selectFileToOpen, "*.txt", defPath, "Select file to import web domains sample from");
                inWorkspace = false;
            }

            if (Path.IsPathRooted(path))
            {
                inWorkspace = false;
            }

            if (inWorkspace)
            {
                if (console != null)
                {
                    path = console.workspace.folder.pathFor(path);
                }
            }

            if (limit == -1)
            {
                limit = 10000;
            }
            if (skip < 0)
            {
                skip = 0;
            }

            if (!File.Exists(path))
            {
                output.log("Sample list file not found at [" + path + "]");
                return;
            }

            if (replace)
            {
                context.sampleList = new webSiteSimpleSample();
            }

            if (!sampleName.isNullOrEmpty())
            {
                context.sampleList.name = sampleName;
            }

            var list = path.openFileToList(true);

            // lineIndex drives "skip", imported drives "limit". The original single counter was never
            // incremented, so any skip > 0 skipped every line and the limit was never reached.
            Int32 lineIndex = 0;
            Int32 imported  = 0;

            foreach (String l in list)
            {
                if (lineIndex < skip)
                {
                    if (debug)
                    {
                        domainAnalysis skipped = new domainAnalysis(l);
                        output.Append(String.Format("Skipping {0,-20} => {1,-20}", l, skipped.urlProper));
                    }
                }
                else
                {
                    if (imported >= limit)
                    {
                        break;
                    }

                    domainAnalysis da = new domainAnalysis(l);
                    if (debug)
                    {
                        output.Append(String.Format("Adding   {0,-20} => {1,-20}", l, da.urlProper));
                    }
                    context.sampleList.Add(da.urlProper);
                    imported++;
                }
                lineIndex++;
            }
        }
Beispiel #18
0
        /// <summary>
        /// Processes the link into Targets
        /// </summary>
        /// <param name="ln">The link; its url is rewritten in place when <paramref name="doLinkResolver"/> is true.</param>
        /// <param name="parentNode">The page on which the link was found.</param>
        /// <param name="doLinkResolver">if set to <c>true</c> the link url is normalized before it is processed.</param>
        /// <returns>True when no target existed for the link and a new one was created</returns>
        public bool processLink(link ln, spiderPage parentNode, bool doLinkResolver = true)
        {
            #region LINK NORMALIZATION =================================
            if (doLinkResolver)
            {
                ln.url = ln.getAbsoluteUrl(parentNode.webpage);
                ln.url = ln.url.httpsToHttpShema();
                ln.url = ln.url.equalizeUrlWithIndexFilenames();
                ln.url = wRecord.domainInfo.GetResolvedUrl(ln.url, imbWEMManager.settings.linkResolver.LNK_RemoveAnchors);

                try
                {
                    domainAnalysis da = new domainAnalysis(ln.url);

                    var domainName = da.domainName;
                    int domainAt   = ln.url.IndexOf(domainName);

                    if (domainAt > -1)
                    {
                        // Exactly one character after the domain name: the url is replaced by the proper domain url
                        int trailing = ln.url.Length - (domainAt + domainName.Length);
                        if (trailing == 1)
                        {
                            ln.url = da.urlProper;
                        }
                    }
                }
                catch (Exception ex)
                {
                    imbWEMManager.log.log("Process link exception: " + ex.Message);
                }
            }
            #endregion ========================================================

            // Records the origin reference: page, link and iteration
            spiderLink sln = new spiderLink(parentNode, ln, wRecord.iteration);

            if (!spider.approveUrl(sln.link))
            {
                // The link is not desirable / allowed
                sln.flags |= spiderLinkFlags.urlNotSupported;
                return(false);
            }

            // The lookup is done before the link is registered below
            spiderTarget target = targets.GetByTarget(sln);

            bool isNewVector = wRecord.web.webLinks.Add(sln);
            sln.flags |= isNewVector ? spiderLinkFlags.newlinkVector : spiderLinkFlags.oldlinkVector;

            bool isNewTarget = wRecord.web.webTargets.Add(sln);
            sln.flags |= isNewTarget ? spiderLinkFlags.newlinkTarget : spiderLinkFlags.oldlinkTarget;

            if (target != null)
            {
                // A target already exists, so nothing new is created
                return(false);
            }

            targets.GetOrCreateTarget(sln, true, true);

            // Writes the link into the list of active links
            wRecord.web.webActiveLinks.Add(sln);

            return(true);
        }