//internal static String RenderRegexPosTypePattern()
/// <summary>
/// Renders the opening string emitted before content of the given level.
/// </summary>
/// <param name="level">Content level being rendered.</param>
/// <param name="mode">Render mode; controls how chunk-level content is opened.</param>
/// <returns>
/// For chunks: a space in current/lemma form modes, otherwise "{".
/// For tokens: an empty string. For blocks and token streams: a newline.
/// </returns>
public static String renderOpen(this cnt_level level, contentTokenSubjectRenderMode mode)
{
    // FIX: removed unreachable `break;` statements that followed each `return`
    // (they triggered CS0162 unreachable-code warnings) and merged the two
    // branches that both returned Environment.NewLine.
    switch (level)
    {
        case flags.token.cnt_level.mcChunk:
            // current/lemma form renders chunks inline, space-separated;
            // any other mode wraps the chunk in braces
            switch (mode)
            {
                case contentTokenSubjectRenderMode.currentForm:
                case contentTokenSubjectRenderMode.lemmaForm:
                    return(" ");
            }
            return("{");

        case flags.token.cnt_level.mcBlock:
        case flags.token.cnt_level.mcTokenStream:
            return(Environment.NewLine);

        default:
        case flags.token.cnt_level.mcToken:
            return("");
    }
}
/// <summary>
/// Counts chunk-level subjects into a TF/DF counter and delegates table construction.
/// </summary>
/// <param name="chunks">The source subjects to count.</param>
/// <param name="document_level">The document level (not read by this overload).</param>
/// <param name="table">The table to populate.</param>
/// <param name="parser">The parser (not read by this overload).</param>
/// <param name="logger">The logger.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The populated lemma term table.</returns>
public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> chunks, cnt_level document_level, webLemmaTermTable table, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
{
    TFDFCounter tfdfCounter = new TFDFCounter();

    // deterministic ordinal ordering of the input before counting
    List <IPipelineTaskSubject> ordered = chunks.ToList();
    ordered.Sort((a, b) => String.CompareOrdinal(a.currentForm, b.currentForm));

    // NB: the loop variable type performs an explicit cast on each element,
    // exactly as the original enumeration did
    foreach (pipelineTaskSubjectContentToken subject in ordered)
    {
        if (subject.contentLevelType != cnt_level.mcChunk) continue;
        tfdfCounter.Add(subject.currentForm, subject);
    }

    return process(tfdfCounter, logger, table, forSingleWebSite);
}
/// <summary>
/// Filters the source sequence to subjects whose content level equals <paramref name="level"/>.
/// </summary>
/// <typeparam name="T">Subject type implementing <see cref="IPipelineTaskSubject"/>.</typeparam>
/// <param name="source">The sequence to filter.</param>
/// <param name="level">The content level to match.</param>
/// <returns>List of matching subjects, added via AddUnique (no duplicates).</returns>
public static List <T> GetSubjectsOfLevel <T>(this IEnumerable <T> source, cnt_level level) where T : IPipelineTaskSubject
{
    var matched = new List <T>();
    foreach (T candidate in source)
    {
        if (candidate.contentLevelType != level) continue;
        matched.AddUnique(candidate);
    }
    return matched;
}
/// <summary>
/// Runs the specified model: schedules its primary tasks, executes them on a background
/// task while this thread polls for completion, then sorts the exit bin by type and level.
/// </summary>
/// <param name="__model">The model.</param>
/// <param name="paramsForPrimaryTasks">The parameters for primary tasks creation, passed to the model</param>
/// <returns>The execution context, with <c>exitByType</c> and <c>exitByLevel</c> populated.</returns>
public pipelineModelExecutionContext run(IPipelineModel __model, params Object[] paramsForPrimaryTasks)
{
    var output = new pipelineModelExecutionContext(__model);

    // seed the scheduler with the model's primary tasks
    var primTasks = __model.createPrimaryTasks(paramsForPrimaryTasks);
    foreach (var pTask in primTasks)
    {
        output.scheduledTasks.Push(pTask);
    }
    statusExplain();

    // worker runs on a background Task; this thread only polls state below.
    // NOTE(review): machineRunning is shared with the worker path without any
    // visible synchronization — presumably acceptable here; confirm memory model.
    Task runMasterTask = new Task(() => { runSeparate(output); });
    machineRunning = true;
    statusUpdate(output);
    runMasterTask.Start();
    while (machineRunning)
    {
        // periodic status report, throttled by StatusReportPeriod
        if (output.GetSinceLastStatusUpdate() > settings.StatusReportPeriod)
        {
            output.lastStatusUpdate = DateTime.Now;
            statusUpdate(output);
        }
        Thread.Sleep(settings.TickForCheck);
        // termination condition: nothing scheduled and the last take was empty
        if (output.scheduledTasks.Count == 0 && output.lastTakeSize == 0)
        {
            machineRunning = false;
            statusUpdate(output);
        }
    }
    logger.log("Sorting exit bin into dictionaries (by type and content level)");

    // bucket every exit subject by its concrete type and by its content level,
    // creating each bucket lazily on first use
    foreach (var item in output.exitSubjects)
    {
        Type t = item.GetType();
        if (!output.exitByType.ContainsKey(t))
        {
            output.exitByType.Add(t, new ConcurrentBag <IPipelineTaskSubject>());
        }
        cnt_level level = item.contentLevelType;
        if (!output.exitByLevel.ContainsKey(level))
        {
            output.exitByLevel.Add(level, new ConcurrentBag <IPipelineTaskSubject>());
        }
        output.exitByType[t].Add(item);
        output.exitByLevel[level].Add(item);
    }

    logger.log("Exit bin sorted by [" + output.exitByType.Count + "] types and [" + output.exitByLevel.Count + "] levels");

    //imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);

    return(output);
}
/// <summary>
/// Builds web lemma terms from the TF/DF counter (chunk/stream index forms) and
/// recomputes the given table from them.
/// </summary>
/// <param name="source">The source; counted only when <paramref name="counter"/> is null.</param>
/// <param name="document_level">The document level (not read by this overload).</param>
/// <param name="table">The table to recompute.</param>
/// <param name="parser">The parser (not read by this overload).</param>
/// <param name="logger">Optional logger; progress is reported only when supplied.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <param name="counter">Pre-built TF/DF counter; built from <paramref name="source"/> when null.</param>
/// <returns>The recomputed <paramref name="table"/>.</returns>
public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, bool forSingleWebSite = false, TFDFCounter counter = null)
{
    if (counter == null) counter = prepareCounter(source);

    List <String> tfdfList = counter.GetIndexForms();
    tfdfList.Sort(String.CompareOrdinal);

    Int32 i = 0;
    Int32 c = 0;
    Int32 li = 0;
    // safety valve against runaway iteration; li increments once per term, so
    // in practice it never exceeds this limit
    Int32 limit = tfdfList.Count + 500;

    List <webLemmaTerm> lemmas = new List <webLemmaTerm>();
    Int32 startIndex = tfdfList.Count;
    Int32 cycleLength = startIndex / 5;  // report progress roughly five times per run

    foreach (String term in tfdfList)
    {
        if (term != null)
        {
            List <imbMCDocumentElement> documentSet = new List <imbMCDocumentElement>();
            List <imbMCDocumentElement> documents = new List <imbMCDocumentElement>();
            Double termFrequency = 0;

            TFDFContainer cn = counter.GetContainer(term);
            webLemmaTerm lemma = new webLemmaTerm();

            if (cn != null)
            {
                lemma.nominalForm = cn.indexForm;
                lemma.name = cn.indexForm;

                // collect distinct parent documents and document sets for DF statistics;
                // every occurrence counts toward term frequency
                foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
                {
                    imbMCDocument document = cntPair?.mcElement?.GetParentOfType <imbMCDocument>();
                    if (document != null)
                    {
                        documents.AddUnique(document);
                        imbMCDocumentSet docSet = document?.parent as imbMCDocumentSet;
                        if (docSet != null)
                        {
                            documentSet.AddUnique(docSet);
                        }
                    }
                    termFrequency += 1;
                }

                lemma.documentSetFrequency = documentSet.Count;
                lemma.AFreqPoints = cn.items.Count();
                lemma.documentFrequency = documents.Count;
                lemma.termFrequency = termFrequency;
                lemmas.Add(lemma);
            }

            li++;
            i = i + 1;
            c = c + 1;
            if (c > cycleLength)
            {
                c = 0;
                // FIX: null-guard the logger — it defaults to null and was
                // previously dereferenced unconditionally (NRE)
                logger?.AppendLine();
                logger?.log("Token Streams TF processing: _" + i.GetRatio(startIndex).ToString("P2") + "_ ");
                logger?.AppendLine();
            }
            if (li > limit)
            {
                logger?.log("Limit broken at processing Token Streams TF processing at [" + li.ToString() + "]");
                break;
            }
        }
    }

    recompute(table, logger, forSingleWebSite, lemmas);
    return(table);
}
//process(IEnumerable<IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table, ILogBuilder logger = null, Boolean forSingleWebSite = false)*
/// <summary>
/// Builds a web lemma term table from token-level TF/DF statistics: counts every
/// letters-only token (lower-cased) of the source and delegates table construction.
/// </summary>
/// <param name="source">The source subjects; their mcToken-level children are counted.</param>
/// <param name="document_level">The document level (not read by this overload).</param>
/// <param name="table">The table to populate. NOTE(review): defaults to null but both
/// <c>table.Count</c> and <c>table.name</c> are dereferenced — callers must pass a
/// non-null table; confirm intended contract before adding a guard.</param>
/// <param name="parser">The parser, forwarded to the delegated overload.</param>
/// <param name="logger">Optional logger; messages are emitted only when supplied.</param>
/// <param name="forSingleWebSite">if set to <c>true</c> [for single web site].</param>
/// <returns>The populated (or already-populated) <paramref name="table"/>.</returns>
public webLemmaTermTable process(IEnumerable <IPipelineTaskSubject> source, cnt_level document_level, webLemmaTermTable table = null, ITextResourceResolver parser = null, ILogBuilder logger = null, Boolean forSingleWebSite = false)
{
    // an already-populated table is treated as final — skip the build entirely
    if (table.Count > 0)
    {
        // FIX: null-guard the logger — it defaults to null
        logger?.log("THIS TABLE " + table.name + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");

        // FIX: atomic check-and-set replaces the previous check-then-increment
        // race, so the beep fires exactly once even under concurrent calls
        if (Interlocked.CompareExchange(ref DoBeep, 2, 1) == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
        }
        return(table);
    }

    TFDFCounter counter = new TFDFCounter();

    // materialize the sequence once, under the shared traversal lock
    lock (getAllChildrenLock)
    {
        source = source.ToList();
    }

    List <IPipelineTaskSubject> rkns = source.GetSubjectsOfLevel <IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcToken });
    rkns.Sort((x, y) => String.CompareOrdinal(x.currentForm, y.currentForm));

    // count only purely alphabetic tokens, case-folded.
    // (A disabled "too short tokens" filter and its dead report branch were removed;
    // the counter there was never incremented.)
    foreach (var tkn in rkns)
    {
        if (tkn.flagBag.ContainsAll(tkn_contains.onlyLetters))
        {
            counter.Add(tkn.currentForm.ToLower(), tkn);
        }
    }

    return(process(table.name, parser, counter, logger, table, forSingleWebSite));
}