/// <summary>
/// Universal crawl-stage hook. Only <see cref="crawlReportingStageEnum.DLCPreinitiation"/> is handled; every other stage is ignored.
/// </summary>
/// <remarks>
/// On pre-initiation the method subscribes <c>onTargetPageAttached</c> to the record's context, obtains the web site
/// repository for the record's domain from the active MC repository, registers it in <c>webSiteReposByDomain</c> when the
/// domain is not registered yet (otherwise a notice is logged), and finally updates the site table and saves the web site data structure.
/// </remarks>
/// <param name="stage">Crawl reporting stage being signalled.</param>
/// <param name="__parent">Reporter that raised the event (not used here).</param>
/// <param name="__task">Domain task being processed (not used here).</param>
/// <param name="wRecord">Site record whose context and domain are used.</param>
public override void eventUniversal(crawlReportingStageEnum stage, directReporterBase __parent, crawlerDomainTask __task, modelSpiderSiteRecord wRecord)
{
    switch (stage)
    {
        case crawlReportingStageEnum.DLCPreinitiation:

            // The same DLC may be delivered more than once (see the log message below). Detaching before attaching
            // keeps exactly one subscription per context, so each attached page is stored only once.
            // Removing a handler that is not subscribed is a no-op.
            modelSpiderSiteRecordEvent pageAttachedHandler = new modelSpiderSiteRecordEvent(onTargetPageAttached);
            wRecord.context.OnTargetPageAttached -= pageAttachedHandler;
            wRecord.context.OnTargetPageAttached += pageAttachedHandler;

            imbMCRepository mcRepo = mcm.activeRepository;
            imbMCWebSite wRepo = mcRepo.GetWebSite(wRecord.domainInfo, true, loger);

            if (!webSiteReposByDomain.ContainsKey(wRecord.domain))
            {
                webSiteReposByDomain.Add(wRecord.domain, wRepo);
            }
            else
            {
                // The previously registered site repository stays in the map; only a notice is written.
                loger.log("DLC sent to CrawlToMC plugin second time: " + wRecord.domain);
            }

            mcRepo.siteTable.AddOrUpdate(wRepo.entry);
            wRepo.SaveDataStructure(mcRepo.folder, loger);
            break;
    }
}
/// <summary>
/// Handler for the DLC-finished event: writes the web site entry registered for the record's domain into the
/// site table of the active MC repository and saves the web site data structure into the repository folder.
/// </summary>
/// <param name="__spider">Reporter that raised the event (not used here).</param>
/// <param name="__task">Domain task that finished (not used here).</param>
/// <param name="__wRecord">Site record; its <c>domain</c> is the key into <c>webSiteReposByDomain</c>.</param>
public override void eventDLCFinished(directReporterBase __spider, crawlerDomainTask __task, modelSpiderSiteRecord __wRecord)
{
    imbMCRepository activeRepo = mcm.activeRepository;

    // The indexer is used on purpose: the domain is expected to be registered already by the pre-initiation stage.
    imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];

    activeRepo.siteTable.AddOrUpdate(siteRepo.entry);
    siteRepo.SaveDataStructure(activeRepo.folder, loger);
}
/// <summary>
/// Event handler invoked when a target page is attached to a site record. When the active MC repository accepts
/// the target (<c>isTargetProper</c>), a web page entry is built for it, filled with the index entry and the
/// HTML/XML source carried by the event arguments, and saved into the web site's folder.
/// </summary>
/// <param name="__wRecord">Site record; its <c>domain</c> is the key into <c>webSiteReposByDomain</c>.</param>
/// <param name="__args">Event arguments carrying the target and its HTML and XML source.</param>
private void onTargetPageAttached(modelSpiderSiteRecord __wRecord, modelSpiderSiteRecordEventArgs __args)
{
    imbMCRepository activeRepo = mcm.activeRepository;
    imbMCWebSite siteRepo = webSiteReposByDomain[__wRecord.domain];
    ISpiderTarget attachedTarget = __args.Target;

    // Targets rejected by the repository are skipped without any further work.
    if (!activeRepo.isTargetProper(attachedTarget))
    {
        return;
    }

    imbMCWebPage pageRepo = activeRepo.BuildWebPage(attachedTarget, siteRepo, loger);
    pageRepo.indexEntry = imbWEMManager.index.pageIndexTable.GetPageForUrl(attachedTarget.url);
    pageRepo.HtmlSourceCode = __args.sourceHtml;
    pageRepo.XmlSourceCode = __args.sourceXml;
    pageRepo.SaveDataStructure(siteRepo.folder, loger);
}
/// <summary>Opens or creates new MCRepository, starts a MC session. Call this before any other MC operation.</summary>
/// <remarks>
/// <para>It initiates specified MCRepository and sets it as current/selected.</para>
/// <para>If a directory for the repository exists, the repository is loaded from it; otherwise a new repository is created.</para>
/// <para>If loading yields no instance, the failure is written to the output log and the currently active repository is left unchanged.</para>
/// </remarks>
/// <param name="repo">Name of repository to start work with</param>
/// <param name="log_msg">A message to be written into repository log after it is initiated, e.g. adding new MCWebSites, or running Data Mining procedure XXXX</param>
/// <param name="debug">If true, it will print out short report on content of the repository (if any)</param>
/// <seealso cref="aceOperationSetExecutorBase"/>
public void aceOperation_mcOpen(
    [Description("Name of repository to start working with")] string repo = "word",
    [Description("A message to be written into repository log after it is initiated, e.g. adding new MCWebSites, or running Data Mining procedure XXXX")] string log_msg = "",
    [Description("If true, it will print out short report on content of the repository (if any)")] bool debug = false)
{
    imbMCRepository instance = null;

    string path = folder.pathFor("\\" + repo);

    if (Directory.Exists(path))
    {
        instance = repo.LoadDataStructure<imbMCRepository>(folder, output);

        // Loading may not produce an instance; touch it only once we know it is there.
        if (instance != null)
        {
            instance.loger.log("Repository loaded ".add(log_msg, ". "));
        }
    }
    else
    {
        // Single timestamp, so the date and the time part always describe the same moment.
        DateTime createdAt = DateTime.Now;
        string descriptionForNew = "MC Repository created [" + createdAt.ToLongDateString() + " " + createdAt.ToLongTimeString() + "]. " + log_msg;

        instance = new imbMCRepository(repo, descriptionForNew, folder);
        instance.loger.log("Repository created ".add(log_msg, ". "));
    }

    // The null check has to come before any use of the instance (debug report included),
    // otherwise the failure branch can never be reached.
    if (instance == null)
    {
        output.log("MC Repository [" + repo + "] failed to initiate");
        return;
    }

    if (debug)
    {
        instance.debugReport(output);
    }

    output.log("MC Repository [" + repo + "] initiated");
    activeRepository = instance;
}
/// <summary>
/// Task builder for <see cref="imbMCRepository"/> level of subject. Sends to next if task is not with <see cref="pipelineTaskMCRepoSubject"/>
/// </summary>
/// <remarks>
/// <para>Loads the repository named by the subject, attaches a new <see cref="imbMCDocumentRepositorium"/> to the subject and
/// schedules one <see cref="pipelineTaskMCSiteSubject"/> task per accepted web site.</para>
/// <para>A web site is accepted when its clean case name was not accepted before and, if the subject declares any
/// <c>MCSiteTargets</c>, when that name is among the (cleaned) targets.</para>
/// </remarks>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task does not carry a <see cref="pipelineTaskMCRepoSubject"/>; otherwise <c>forward</c>.</returns>
/// <exception cref="aceScienceException">Thrown when one or more requested site targets were not found among the web sites of the repository.</exception>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCRepoSubject> realTask = task as pipelineTask<pipelineTaskMCRepoSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCRepoSubject realSubject = realTask.subject;

    folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");

    imbMCRepository repo = realSubject.MCRepoName.LoadDataStructure<imbMCRepository>(repoFolder, task.context.logger);

    imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();
    docRepo.webRepository = repo;
    realSubject.mcElement = docRepo;
    realSubject.MCRepo = repo;

    List<imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

    if (!websites.Any())
    {
        // Only logged here; requested-but-missing targets are reported by the exception at the end.
        task.context.logger.log(this.name + " Failed --- no web sites loaded");
    }

    // Requested site names in declaration order (may contain duplicates) plus a set for fast membership tests.
    List<String> needle = new List<string>();
    realSubject.MCSiteTargets.ForEach(x => needle.Add(pipelineSubjectTools.GetCleanCaseName(x)));
    HashSet<String> requested = new HashSet<String>(needle);
    bool restrictToTargets = needle.Count > 0;

    // Names of the sites for which a task was already scheduled.
    HashSet<String> accepted = new HashSet<String>();

    foreach (imbMCWebSite site in websites)
    {
        String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

        if (restrictToTargets && !requested.Contains(sName))
        {
            continue; // not among the requested targets
        }

        if (accepted.Contains(sName))
        {
            continue; // the same site name was already scheduled
        }

        pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
        mCSiteSubject.MCSite = site;

        imbMCDocumentSet docSet = new imbMCDocumentSet();
        docRepo.Add(docSet);

        mCSiteSubject.mcElement = docSet;
        mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
        mCSiteSubject.name = sName;
        mCSiteSubject.parent = realSubject;
        realSubject.Add(mCSiteSubject);

        accepted.Add(mCSiteSubject.name);

        pipelineTask<pipelineTaskMCSiteSubject> taskForSite = new pipelineTask<pipelineTaskMCSiteSubject>(mCSiteSubject);
        task.context.scheduledTasks.Push(taskForSite);
    }

    // Membership-based check: a target listed twice (or two targets with the same clean name)
    // is no longer reported as missing once the site has been found.
    List<String> missing = needle.Where(x => !accepted.Contains(x)).Distinct().ToList();
    if (missing.Any())
    {
        // Names separated by a space, with a trailing space (message format kept as before).
        String nd = String.Join(" ", missing.ToArray()) + " ";
        throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
    }

    return forward;
}