/// <summary>
/// Task builder for the <see cref="imbMCRepository"/> level of subject: loads the repository named by the subject
/// and schedules one <see cref="pipelineTaskMCSiteSubject"/> task for every accepted web site.
/// Sends to next if the task is not with <see cref="pipelineTaskMCRepoSubject"/>.
/// </summary>
/// <remarks>
/// A web site is accepted when its clean case name is among the subject's <c>MCSiteTargets</c> (or when no targets
/// are defined at all) and no site was already scheduled under the same name.
/// </remarks>
/// <param name="task">The task.</param>
/// <returns><c>next</c> when the task does not carry a <see cref="pipelineTaskMCRepoSubject"/>; otherwise <c>forward</c>.</returns>
/// <exception cref="aceScienceException">Thrown when at least one requested site target was not scheduled from the repository.</exception>
public override IPipelineNode process(IPipelineTask task)
{
    pipelineTask<pipelineTaskMCRepoSubject> realTask = task as pipelineTask<pipelineTaskMCRepoSubject>;
    if (realTask == null)
    {
        return next;
    }

    pipelineTaskMCRepoSubject realSubject = realTask.subject;

    folderNode repoFolder = appManager.Application.folder_resources.Add(imbMCManager.MCRepo_DefaultDirectoryName, "MC Repositories", "Root directory with repositories of Crawled Web content");
    imbMCRepository repo = realSubject.MCRepoName.LoadDataStructure<imbMCRepository>(repoFolder, task.context.logger);

    imbMCDocumentRepositorium docRepo = new imbMCDocumentRepositorium();
    docRepo.webRepository = repo;

    realSubject.mcElement = docRepo;
    realSubject.MCRepo = repo;

    List<imbMCWebSite> websites = repo.GetAllWebSites(task.context.logger, takeSetup);

    if (!websites.Any())
    {
        // Only reported; processing continues so that missing targets are still detected below.
        task.context.logger.log(this.name + " Failed --- no web sites loaded");
    }

    // Clean names of the requested sites. Duplicates are dropped: with a plain list, a target listed twice
    // would survive the "remove what was scheduled" step below and be reported as missing by mistake.
    List<String> needle = new List<String>();
    HashSet<String> needleLookup = new HashSet<String>();
    realSubject.MCSiteTargets.ForEach(x =>
    {
        String cleanName = pipelineSubjectTools.GetCleanCaseName(x);
        if (needleLookup.Add(cleanName))
        {
            needle.Add(cleanName);
        }
    });

    // Names of the site subjects scheduled so far.
    HashSet<String> urls = new HashSet<String>();

    foreach (imbMCWebSite site in websites)
    {
        String sName = pipelineSubjectTools.GetCleanCaseName(site.domainInfo.urlProper);

        // With no targets defined every site is accepted; otherwise only the requested ones.
        Boolean requested = (needle.Count == 0) || needleLookup.Contains(sName);
        if (!requested || urls.Contains(sName))
        {
            continue;
        }

        pipelineTaskMCSiteSubject mCSiteSubject = new pipelineTaskMCSiteSubject();
        mCSiteSubject.MCSite = site;

        imbMCDocumentSet docSet = new imbMCDocumentSet();
        docRepo.Add(docSet);

        mCSiteSubject.mcElement = docSet;
        mCSiteSubject.contentLevelType = flags.token.cnt_level.mcSite;
        mCSiteSubject.name = sName;
        mCSiteSubject.parent = realSubject;

        realSubject.Add(mCSiteSubject);
        urls.Add(mCSiteSubject.name);

        pipelineTask<pipelineTaskMCSiteSubject> taskForSite = new pipelineTask<pipelineTaskMCSiteSubject>(mCSiteSubject);
        task.context.scheduledTasks.Push(taskForSite);
    }

    if (urls.Count < needle.Count)
    {
        List<String> missing = needle.Where(x => !urls.Contains(x)).ToList();
        if (missing.Any())
        {
            // Each name is followed by a single space, matching the established message format.
            String nd = String.Concat(missing.Select(x => x + " "));
            throw new aceScienceException("Some sites are not found in the MC Repository!! [" + nd + "]", null, realSubject, "Sites not loaded [" + nd + "]", this);
        }
    }

    return forward;
}
/// <summary>
/// Constructs the webLemmaTable: builds one <see cref="webLemmaTerm"/> for every index form of the counter that has
/// a container, then hands the collected lemmas to <c>recompute</c>.
/// </summary>
/// <remarks>
/// The list returned by <c>counter.GetIndexForms()</c> is sorted in place with ordinal comparison before processing.
/// </remarks>
/// <param name="counter">The counter supplying the index forms and their containers.</param>
/// <param name="logger">The logger that receives the progress messages.</param>
/// <param name="table">The table; passed to <c>recompute</c> and returned.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]; forwarded to <c>recompute</c>.</param>
/// <returns>The <paramref name="table"/> instance that was passed in.</returns>
public webLemmaTermTable process(TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table, Boolean forSingleWebSite = false)
{
    List<String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 total = tfdfList.Count;

    // Progress is reported each time more than one fifth of the terms was processed since the last report.
    Int32 cycleLength = total / 5;
    Int32 processed = 0;
    Int32 sinceLastReport = 0;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

    foreach (String term in tfdfList)
    {
        if (term == null)
        {
            // Null terms are skipped and do not count towards the progress.
            continue;
        }

        TFDFContainer cn = counter.GetContainer(term);
        if (cn != null)
        {
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
            Double termFrequency = 0;

            webLemmaTerm lemma = new webLemmaTerm();
            lemma.nominalForm = cn.indexForm;
            lemma.name = cn.indexForm;

            foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
            {
                imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                if (document != null)
                {
                    documents.AddUnique(document);

                    imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                    if (docSet != null)
                    {
                        documentSet.AddUnique(docSet);
                    }
                }

                // Every token occurrence counts as one, whether or not a parent document was resolved.
                termFrequency += 1;
            }

            lemma.documentSetFrequency = documentSet.Count;
            lemma.AFreqPoints = cn.items.Count();
            lemma.documentFrequency = documents.Count;
            lemma.termFrequency = termFrequency;
            lemmas.Add(lemma);
        }

        processed++;
        sinceLastReport++;

        if (sinceLastReport > cycleLength)
        {
            sinceLastReport = 0;
            logger.AppendLine();
            logger.log("Chunk TF processing: _" + processed.GetRatio(total).ToString("P2") + "_ ");
            logger.AppendLine();
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);

    return table;
}
/// <summary>
/// Processes the specified source: builds one <see cref="webLemmaTerm"/> for every index form of the counter that has
/// a container, then hands the collected lemmas to <c>recompute</c>.
/// </summary>
/// <remarks>
/// The list returned by <c>counter.GetIndexForms()</c> is sorted in place with ordinal comparison before processing.
/// Progress logging is skipped when <paramref name="logger"/> is <c>null</c>.
/// NOTE(review): <paramref name="logger"/> and <paramref name="table"/> are forwarded to <c>recompute</c> as given —
/// confirm that it tolerates <c>null</c> for callers relying on the defaults.
/// </remarks>
/// <param name="source">The source; used only to prepare the counter when <paramref name="counter"/> is <c>null</c>.</param>
/// <param name="document_level">The document level. Not used by this method body.</param>
/// <param name="table">The table; passed to <c>recompute</c> and returned.</param>
/// <param name="parser">The parser. Not used by this method body.</param>
/// <param name="logger">The logger that receives the progress messages; may be <c>null</c>.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site]; forwarded to <c>recompute</c>.</param>
/// <param name="counter">The counter; when <c>null</c> it is created with <c>prepareCounter(source)</c>.</param>
/// <returns>The <paramref name="table"/> instance that was passed in.</returns>
public webLemmaTermTable process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
{
    if (counter == null)
    {
        counter = prepareCounter(source);
    }

    List<String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 total = tfdfList.Count;

    // Progress is reported each time more than one fifth of the terms was processed since the last report.
    Int32 cycleLength = total / 5;
    Int32 processed = 0;
    Int32 sinceLastReport = 0;

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();

    foreach (String term in tfdfList)
    {
        if (term == null)
        {
            // Null terms are skipped and do not count towards the progress.
            continue;
        }

        TFDFContainer cn = counter.GetContainer(term);
        if (cn != null)
        {
            List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();
            List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
            Double termFrequency = 0;

            webLemmaTerm lemma = new webLemmaTerm();
            lemma.nominalForm = cn.indexForm;
            lemma.name = cn.indexForm;

            foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
            {
                imbMCDocument document = cntPair?.mcElement?.GetParentOfType<imbMCDocument>();
                if (document != null)
                {
                    documents.AddUnique(document);

                    imbMCDocumentSet docSet = document.parent as imbMCDocumentSet;
                    if (docSet != null)
                    {
                        documentSet.AddUnique(docSet);
                    }
                }

                // Every token occurrence counts as one, whether or not a parent document was resolved.
                termFrequency += 1;
            }

            lemma.documentSetFrequency = documentSet.Count;
            lemma.AFreqPoints = cn.items.Count();
            lemma.documentFrequency = documents.Count;
            lemma.termFrequency = termFrequency;
            lemmas.Add(lemma);
        }

        processed++;
        sinceLastReport++;

        if (sinceLastReport > cycleLength)
        {
            sinceLastReport = 0;

            // logger defaults to null, so the progress report must not dereference it unconditionally.
            logger?.AppendLine();
            logger?.log("Token Streams TF processing: _" + processed.GetRatio(total).ToString("P2") + "_ ");
            logger?.AppendLine();
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);

    return table;
}