/// <summary>
/// Collects the chunk-level subjects already present under <paramref name="streamSubject"/> and builds
/// new chunk subjects from its tokens, one rule from <c>settings.rules</c> at a time.
/// </summary>
/// <param name="streamSubject">Subject whose rendered content is matched against the chunk rules.</param>
/// <returns>
/// The existing chunk subjects followed by the chunk subjects created in this call. When the subject's
/// <c>mcElement</c> is not an <c>imbMCDocumentElement</c>, only the existing chunks are returned.
/// </returns>
protected override List<pipelineTaskSubjectContentToken> processIteration(pipelineTaskSubjectContentToken streamSubject)
{
    List<pipelineTaskSubjectContentToken> chunks = new List<pipelineTaskSubjectContentToken>();

    if (streamSubject.contentLevelType == cnt_level.mcChunk)
    {
        chunks.Add(streamSubject);
    }

    var existingChunks = pipelineSubjectTools.GetSubjectChildrenTokenType<pipelineTaskSubjectContentToken, IGraphNode>(streamSubject, new cnt_level[] { cnt_level.mcChunk }, true);
    chunks.AddRange(existingChunks);

    // New chunks are attached to the stream element, so nothing can be built without one.
    imbMCDocumentElement stream = streamSubject.mcElement as imbMCDocumentElement;
    if (stream == null)
    {
        return chunks;
    }

    subjectRenderLayers layers = new subjectRenderLayers();

    foreach (chunkMatchRule rule in settings.rules)
    {
        textMap<pipelineTaskSubjectContentToken> renderMap = layers.render(streamSubject, rule.renderMode);
        MatchCollection matches = rule.regex.Matches(renderMap.render);

        // Select(...) returns one token list per regex match (presumably the tokens covered by that match).
        List<List<pipelineTaskSubjectContentToken>> matchedGroups = renderMap.Select(matches);

        // Keep only tokens on a content level the rule accepts; groups left empty are dropped.
        // All groups are filtered before any chunk is built, because building a chunk re-parents tokens.
        List<List<pipelineTaskSubjectContentToken>> candidateGroups = new List<List<pipelineTaskSubjectContentToken>>();
        foreach (List<pipelineTaskSubjectContentToken> matchedGroup in matchedGroups)
        {
            List<pipelineTaskSubjectContentToken> accepted = matchedGroup
                .Where(token => rule.contentLevel.Contains(token.contentLevelType))
                .ToList();

            if (accepted.Any())
            {
                candidateGroups.Add(accepted);
            }
        }

        foreach (List<pipelineTaskSubjectContentToken> group in candidateGroups)
        {
            if (groupPassesFlagCriteria(rule, group))
            {
                chunks.Add(createChunkSubject(streamSubject, stream, rule, group));
            }
        }
    }

    return chunks;
}

/// <summary>
/// Checks whether the tokens of <paramref name="group"/> agree on the flags of every type listed in
/// <c>rule.flagTypesToMatch</c>. Disagreement only rejects the group when
/// <c>settings.doCheckGramTagCriteria</c> is set.
/// </summary>
/// <param name="rule">Rule providing the flag types to compare.</param>
/// <param name="group">Tokens that would form one chunk.</param>
/// <returns><c>true</c> when the group may be turned into a chunk.</returns>
private Boolean groupPassesFlagCriteria(chunkMatchRule rule, List<pipelineTaskSubjectContentToken> group)
{
    if (!rule.flagTypesToMatch.Any())
    {
        return true;
    }

    // First flag seen for each flag type; later tokens are compared against it.
    Dictionary<Type, Object> firstSeen = new Dictionary<Type, Object>();

    foreach (pipelineTaskSubjectContentToken token in group)
    {
        foreach (Type flagType in rule.flagTypesToMatch)
        {
            Object flag = token.flagBag.FirstOrDefault(x => x.GetType() == flagType);
            if (flag == null)
            {
                continue; // a token without a flag of this type does not constrain the group
            }

            Object reference;
            if (!firstSeen.TryGetValue(flagType, out reference))
            {
                firstSeen.Add(flagType, flag);
                continue;
            }

            // Value equality is required here: the flags are held as Object, so the == / != operators
            // would compare references and report two separately boxed equal values as different.
            if (settings.doCheckGramTagCriteria && !Object.Equals(reference, flag))
            {
                return false;
            }
        }
    }

    return true;
}

/// <summary>
/// Creates a chunk subject (and its <c>imbMCChunk</c> element) for <paramref name="group"/>, attaches both
/// to the stream and moves the grouped tokens and their elements under the new chunk.
/// </summary>
/// <param name="streamSubject">Subject that becomes the parent of the new chunk subject.</param>
/// <param name="stream">Element that becomes the parent of the new chunk element.</param>
/// <param name="rule">Rule that matched the group; supplies <c>chunkType</c>.</param>
/// <param name="group">Tokens to move into the chunk; must not be empty.</param>
/// <returns>The newly created chunk subject.</returns>
private pipelineTaskSubjectContentToken createChunkSubject(pipelineTaskSubjectContentToken streamSubject, imbMCDocumentElement stream, chunkMatchRule rule, List<pipelineTaskSubjectContentToken> group)
{
    // Random 4-character prefix followed by the names of all grouped tokens.
    String chunkName = imbStringGenerators.getRandomString(4) + String.Concat(group.Select(token => token.name));

    pipelineTaskSubjectContentToken chunkSubject = new pipelineTaskSubjectContentToken();
    chunkSubject.name = chunkName;
    streamSubject.Add(chunkSubject);

    imbMCChunk chunk = new imbMCChunk();
    chunk.name = chunkName;

    chunkSubject.contentLevelType = cnt_level.mcChunk;
    // NOTE(review): the flag bag is cleared and refilled at the end of this method, so this entry only
    // exists while the chunk is being assembled - confirm that is intended.
    chunkSubject.flagBag.Add(rule.chunkType);
    chunkSubject.mcElement = chunk;

    chunk.htmlNode = stream.htmlNode;
    chunk.position = group.Min(x => x.mcElement.position);
    stream.Add(chunk);

    List<Object> commonFlags = new List<Object>();
    List<Object> forbidenFlags = new List<Object>();
    Boolean isFirstSubject = true;

    foreach (pipelineTaskSubjectContentToken token in group)
    {
        // Move the element and the subject from their current parents under the chunk.
        token.mcElement.removeFromParent();
        chunk.Add(token.mcElement);
        token.removeFromParent();
        chunkSubject.Add(token);

        if (isFirstSubject)
        {
            commonFlags.AddRange(token.flagBag, true);
            isFirstSubject = false;
            continue;
        }

        // NOTE(review): a flag only becomes forbidden when it is absent from commonFlags, so the Remove
        // below never finds anything and commonFlags stays equal to the first token's flags. If an
        // intersection across all tokens was intended, this needs a follow-up; behaviour is kept as-is.
        foreach (Object flag in token.flagBag)
        {
            if (forbidenFlags.Contains(flag))
            {
                commonFlags.Remove(flag);
            }
            else if (!commonFlags.Contains(flag))
            {
                forbidenFlags.AddUnique(flag);
            }
        }
    }

    chunkSubject.initialForm = chunkSubject.render(contentTokenSubjectRenderMode.currentForm).render;
    var lemmaForm = chunkSubject.render(contentTokenSubjectRenderMode.lemmaForm);
    chunkSubject.currentForm = lemmaForm.GetCleanRender();
    chunk.content = chunkSubject.currentForm;

    chunkSubject.flagBag.Clear();
    chunkSubject.flagBag.AddRange(commonFlags, true);

    return chunkSubject;
}
/// <summary>
/// Builds the lemma term table from the index forms held by <paramref name="counter"/>.
/// </summary>
/// <param name="tableName">Name used when a new table is created and, when not empty, assigned to <c>table.name</c>.</param>
/// <param name="parser">Resolver that supplies the lemma set for each index form.</param>
/// <param name="counter">Source of the index forms and of the containers holding their token occurrences.</param>
/// <param name="logger">Receives progress and diagnostic messages.</param>
/// <param name="table">Table to fill; a new one is created when <c>null</c>. A table that already has items is returned untouched.</param>
/// <param name="forSingleWebSite">Forwarded to <c>recompute</c> and included in the "already has items" log message.</param>
/// <returns>The filled table, or the supplied table unchanged when it already contained items.</returns>
protected webLemmaTermTable process(String tableName, ITextResourceResolver parser, TFDFCounter counter, ILogBuilder logger, webLemmaTermTable table = null, Boolean forSingleWebSite = false)
{
    if (table == null)
    {
        table = new webLemmaTermTable(tableName);
    }

    if (table.Count > 0)
    {
        logger.log("THIS TABLE " + tableName + " ALREADY HAS [" + table.Count + "] ITEMS --- HALTING BUILD [For single web site: " + forSingleWebSite + "]");
        if (DoBeep == 1)
        {
            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 250);
            Interlocked.Increment(ref DoBeep);
        }
        return table;
    }

    List<String> tfdfList = counter.GetIndexForms();
    Int32 startIndex = tfdfList.Count;
    Int32 limit = startIndex + 100;       // safety stop for the loop below
    Int32 cycleLength = startIndex / 5;   // progress is reported roughly every fifth of the list

    if (!tableName.isNullOrEmpty())
    {
        table.name = tableName;
    }

    List<webLemmaTerm> lemmas = new List<webLemmaTerm>();
    Int32 iterations = 0;
    Int32 sinceLastReport = 0;

    while (tfdfList.Any())
    {
        String term = tfdfList.FirstOrDefault();
        Int32 countBefore = tfdfList.Count;
        Int32 consumed;

        if (term == null)
        {
            // A null entry can never be resolved; drop it so it cannot stall the loop until the
            // iteration limit is hit and the remaining terms are abandoned.
            tfdfList.RemoveAt(0);
            consumed = 1;
        }
        else
        {
            // The resolver receives the working list; the drop in its Count is taken as the number of
            // forms it consumed for this lemma set.
            lexicGraphSetWithLemma inflectSet = parser.GetLemmaSetForInflection(term, tfdfList, logger);
            consumed = countBefore - tfdfList.Count;

            if (consumed == 0)
            {
                // Nothing was consumed: record the term as unresolved and remove it here.
                table.unresolved.Add(term);
                tfdfList.Remove(term);
                consumed = 1;
            }
            else if (isAllowedByPosPolicy(inflectSet))
            {
                lemmas.Add(buildLemmaTerm(inflectSet, counter, logger));
            }
        }

        iterations++;
        sinceLastReport += consumed;

        if (sinceLastReport > cycleLength)
        {
            sinceLastReport = 0;
            Int32 processed = startIndex - tfdfList.Count;
            logger.AppendLine();
            logger.log("TF-IDF processed: _" + processed.GetRatio(startIndex).ToString("P2") + "_");
            logger.AppendLine();
        }

        if (iterations > limit)
        {
            logger.log("Limit broken at processing WEB Lemma Frequency table at [" + iterations.ToString() + "]");
            break;
        }
    }

    if (settings.doComputeTFIDF)
    {
        recompute(table, logger, forSingleWebSite, lemmas);
    }
    else
    {
        foreach (var lemma in lemmas)
        {
            table.Add(lemma);
        }
    }

    return table;
}

/// <summary>
/// Applies the POS type filter from <c>settings</c> to a lemma set.
/// </summary>
/// <param name="inflectSet">Lemma set whose gram tags are inspected.</param>
/// <returns>
/// <c>true</c> when no allowed types are configured, or when the set carries at least one allowed type and,
/// under <c>settings.strictPosTypePolicy</c>, does not carry <c>pos_type.V</c>.
/// </returns>
private Boolean isAllowedByPosPolicy(lexicGraphSetWithLemma inflectSet)
{
    if (!settings.allowedLemmaTypes.Any())
    {
        return true;
    }

    var tags = inflectSet.GetTagsFromGramTags<pos_type>(pos_type.none);
    if (!tags.ContainsAny(settings.allowedLemmaTypes))
    {
        return false;
    }

    // The strict policy additionally rejects any set tagged with pos_type.V.
    return !(settings.strictPosTypePolicy && tags.Contains(pos_type.V));
}

/// <summary>
/// Creates the <c>webLemmaTerm</c> for one lemma set, accumulating the weighted term frequency and the
/// number of distinct documents and document sets over all of its inflected forms.
/// </summary>
/// <param name="inflectSet">Lemma set providing the lemma form and its inflections.</param>
/// <param name="counter">Provides the container of token occurrences for each inflected form.</param>
/// <param name="logger">Receives a message for every token whose document set cannot be determined.</param>
/// <returns>The populated lemma term.</returns>
private webLemmaTerm buildLemmaTerm(lexicGraphSetWithLemma inflectSet, TFDFCounter counter, ILogBuilder logger)
{
    List<imbMCDocumentElement> documents = new List<imbMCDocumentElement>();
    List<imbMCDocumentElement> documentSet = new List<imbMCDocumentElement>();

    webLemmaTerm lemma = new webLemmaTerm();
    lemma.nominalForm = inflectSet.lemmaForm;
    lemma.name = inflectSet.lemmaForm;

    Double termFrequency = 0;

    foreach (lexicInflection inflect in inflectSet.Values)
    {
        TFDFContainer cn = counter.GetContainer(inflect.inflectedForm);
        if (cn == null)
        {
            // No container for this form: it is still recorded as another form of the lemma.
            lemma.otherForms.AddUnique(inflect.inflectedForm);
            continue;
        }

        lemma.AFreqPoints += cn.items.Count;

        foreach (pipelineTaskSubjectContentToken cntPair in cn.items)
        {
            imbMCDocument document = cntPair.mcElement.GetParentOfType<imbMCDocument>();
            if (document != null)
            {
                // Only real documents are collected, so a missing parent never adds to documentFrequency.
                documents.AddUnique(document);
            }

            imbMCDocumentElement docSet = document?.parent as imbMCDocumentElement;
            if (docSet != null)
            {
                documentSet.AddUnique(docSet);
            }
            else
            {
                logger.log(cn.indexForm + " (" + cntPair.mcElement.toStringSafe("mcElement=null") + ")");
            }

            // Each occurrence is weighted by the kind of container it was found in.
            if (cntPair.flagBag.Contains(cnt_containerType.link))
            {
                termFrequency += settings.anchorTextFactor;
            }
            else if (cntPair.flagBag.Contains(cnt_containerType.title))
            {
                termFrequency += settings.titleTextFactor;
            }
            else
            {
                termFrequency += settings.contentTextFactor;
            }

            cntPair.AddGraph(inflect);
        }

        lemma.otherForms.AddUnique(cn.indexForm);
    }

    lemma.documentSetFrequency = documentSet.Count;
    lemma.documentFrequency = documents.Count;
    lemma.termFrequency = termFrequency;

    return lemma;
}