/// <summary>
/// Builds the cloud - common part of the algorithm. Creates one weighted node per surviving
/// entry of <c>c.nodeNames</c>, then one weighted link per lemma co-occurrence pair found in
/// <c>c.lemmasList</c>, and finally copies primary/secondary chunk nominal forms into the cloud.
/// </summary>
/// <param name="c">Semantic construct holding node names, lemma lists, term categories and trash bin.</param>
/// <param name="chunkTable">The chunk table (not read directly here; kept for signature compatibility).</param>
/// <param name="termTable">The term table - source of node weights when <c>settings.assignTermTableWeightToNode</c> is set.</param>
/// <param name="output">The cloud receiving nodes and links.</param>
/// <param name="logger">The logger (not used in this stage).</param>
/// <param name="resolver">The resolver (not used in this stage).</param>
/// <returns>The populated <paramref name="output"/> cloud.</returns>
/// <exception cref="aceScienceException">Thrown when <c>settings.assignChunkTableWeightToLink</c> is true
/// but <c>settings.doSumExistingLinkWeights</c> is false: an already-existing link would keep whichever
/// chunk weight arrived first, i.e. the resulting weight would be assigned by chance.</exception>
protected lemmaSemanticCloud BuildCloud(lemmaSemanticConstruct c, webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, ITextResourceResolver resolver)
{
    // discard trashed lemmas before node creation
    c.TrashBin.ForEach(x => c.nodeNames.Remove(x));

    // <------------ creating nodes
    foreach (String n in c.nodeNames)
    {
        Double weight = 0;
        if (settings.assignTermTableWeightToNode)
        {
            var lemma = termTable[n];
            if (lemma != null)
            {
                weight = lemma.weight;
            }
        }
        else
        {
            weight = 1;
        }

        // a term missing from the term table keeps weight 0 and produces no node
        if (weight > 0)
        {
            // case clouds and class clouds apply the exact same factoring rule,
            // each gated by its own settings switch
            Boolean doFactor = c.isCaseCloud ? settings.doFactorToCaseClouds : settings.doFactorToClassClouds;
            if (doFactor)
            {
                if (c.PrimaryTerms.Contains(n))
                {
                    output.AddNode(n, weight * settings.PrimaryTermWeightFactor, 2);
                }
                else if (c.SecondaryTerms.Contains(n))
                {
                    output.AddNode(n, weight * settings.SecondaryTermWeightFactor, 1);
                }
                else
                {
                    output.AddNode(n, weight * settings.ReserveTermWeightFactor, 0);
                }
            }
            else
            {
                output.AddNode(n, weight);
            }
        }
    }

    // <-------- creating links: connect the first lemma of each chunk with every other lemma of it
    foreach (List<String> n in c.lemmasList)
    {
        String first = n[0];
        if (c.TrashBin.Contains(first)) continue;
        if (!output.ContainsNode(first, true)) continue;

        foreach (String m in n)
        {
            if (c.TrashBin.Contains(m)) continue;
            if (m == first) continue;
            if (!output.ContainsNode(m, true)) continue;

            Double weight = 1;
            if (settings.assignChunkTableWeightToLink)
            {
                // weightDict is keyed by the lemma list instance itself
                weight = c.weightDict[n].weight;
            }
            else if (settings.doAdjustLinkWeightByChunkSize)
            {
                weight = (n.Count - 1).GetRatio(1);
            }

            var link = output.GetLink(first, m);
            if (link == null)
            {
                output.AddLink(first, m, weight);
            }
            else if (settings.doSumExistingLinkWeights)
            {
                link.weight += weight;
            }
            else if (settings.assignChunkTableWeightToLink)
            {
                // it will not create a new link as it already exists; with assignChunkTableWeightToLink=true
                // the kept weight would depend on chunk iteration order only - reject the configuration
                throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" +
                    ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
            }
        }
    }

    c.primaryChunks.ForEach(x => output.primaryChunks.Add(x.nominalForm));
    c.secondaryChunks.ForEach(x => output.secondaryChunks.Add(x.nominalForm));
    return (output);
}
/// <summary>
/// Processes the position enhanced: iteratively extracts primary terms (via
/// <see cref="lemmaSemanticConstruct.NextIteration"/>), then POS-filters the lemmas of every
/// relevant chunk and hands the resulting construct to <c>BuildCloud</c>.
/// </summary>
/// <param name="chunkTable">The chunk table - source of all chunks.</param>
/// <param name="termTable">The term table, forwarded to <c>BuildCloud</c>.</param>
/// <param name="output">The output cloud; a new one is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">The resolver used for POS lookup of each lemma.</param>
/// <returns>The cloud built from the optimized construct.</returns>
protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    List<webLemmaTerm> allChunks = chunkTable.GetList();

    if (output == null) { output = new lemmaSemanticCloud(); }

    // <----------------- PRIMARY
    // Fixed-point loop over NextIteration. Termination relies on reference identity:
    // NextIteration may return lastIteration itself (rollback), which makes c == cl.
    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);
    lemmaSemanticConstruct cl = new lemmaSemanticConstruct(subjects);
    while (c != cl)
    {
        c = cl;
        cl = lemmaSemanticConstruct.NextIteration(cl, resolver, allChunks, settings, subjects, logger);

        // hard iteration cap from settings
        if (cl.createdInIteration > settings.primaryTermOptimizationIterationLimit)
        {
            c = cl;
            break;
        }
        if (cl.OptimizationDone) { break; }
    }
    c = cl;

    // <------------------- PRIM
    c.CollectRelevantTerms(settings.doReserveTermsForClass);

    if (!c.isCaseCloud) { c.LogConstruct(logger); }

    // <---------------------------------
    // keep only chunks that contain at least one relevant term
    var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        List<String> l_out = new List<string>();

        foreach (String lm in lemmas)
        {
            if (c.NotProcessed(lm))
            {
                // first encounter: POS-classify via the resolver
                var lu = resolver.GetLexicUnit(lm, logger);
                if (lu == null)
                {
                    c.TrashBin.AddUnique(lm);
                }
                else
                {
                    var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                    // only nouns and adjectives survive
                    if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                    {
                        c.ReserveTerms.AddUnique(lm);
                        l_out.Add(lm);
                    }
                    else
                    {
                        c.TrashBin.AddUnique(lm);
                    }
                }
            }
            else
            {
                // already classified earlier: keep it unless trashed
                if (!c.TrashBin.Contains(lm)) { l_out.Add(lm); }
            }
        }

        // single-lemma chunks cannot produce links, so they are skipped
        if (l_out.Count > 1)
        {
            l_out.Sort((x, y) => String.CompareOrdinal(x, y));
            c.lemmasList.Add(l_out);
            c.weightDict.Add(l_out, chunk);   // keyed by the l_out list instance, read back in BuildCloud
            c.nodeNames.AddRange(l_out, true);
        }
    }
    return (BuildCloud(c, chunkTable, termTable, output, logger, resolver));
}
/// <summary>
/// Processes the alternative (frequency-based) path: ranks lemmas of the most important
/// chunks by occurrence count into primary/secondary/reserve categories, then builds the
/// cloud from all chunks containing a relevant term.
/// </summary>
/// <param name="chunkTable">The chunk table - source of all chunks.</param>
/// <param name="termTable">The term table, forwarded to <c>BuildCloud</c>.</param>
/// <param name="output">The output cloud; a new one is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects; with more than one, chunks are pre-filtered by document-set frequency.</param>
/// <param name="resolver">The resolver, forwarded to <c>BuildCloud</c>.</param>
/// <returns>The cloud built from the frequency-classified construct.</returns>
protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null) { output = new lemmaSemanticCloud(); }

    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

    List<webLemmaTerm> allChunks = chunkTable.GetList();

    // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
    IEnumerable<webLemmaTerm> vipChunks = null;
    if (subjects.Count > 1)
    {
        vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
    }
    else
    {
        vipChunks = allChunks;
    }

    // count lemma occurrences (lemmas shorter than 3 characters are ignored)
    instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();
    foreach (webLemmaTerm chunk in vipChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        lemmaCounter.AddInstanceRange(lemmas);
    }

    c.RelevantTerms = lemmaCounter.getSorted();
    lemmaCounter.reCalculate();

    // classify by frequency band: max -> primary, above min -> secondary, rest -> reserve
    foreach (String term in c.RelevantTerms)
    {
        if (lemmaCounter[term] == lemmaCounter.maxFreq)
        {
            c.PrimaryTerms.Add(term);
        }
        else if (lemmaCounter[term] > lemmaCounter.minFreq)
        {
            c.SecondaryTerms.Add(term);
        }
        else
        {
            c.ReserveTerms.Add(term);
        }
    }

    c.CollectRelevantTerms(settings.doReserveTermsForClass);
    c.LogConstruct(logger);

    // <---------------------------------
    // keep only chunks that contain at least one relevant term
    var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));
    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();

        // single-lemma chunks cannot produce links, so they are skipped
        if (lemmas.Count > 1)
        {
            lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
            c.lemmasList.Add(lemmas);
            c.weightDict.Add(lemmas, chunk);   // keyed by the lemmas list instance, read back in BuildCloud
            c.nodeNames.AddRange(lemmas, true);
        }
    }
    return (BuildCloud(c, chunkTable, termTable, output, logger, resolver));
}
/// <summary>
/// Produces the next iteration of primary-term optimization: re-selects top chunks with a
/// tightened document-set-frequency threshold, POS-classifies their lemmas into primary/secondary
/// terms, then classifies the remaining lemmas as secondary/reserve/trash. When the primary-term
/// count drops to or below <c>settings.primaryTermLowTargetCount</c>, optimization is marked done;
/// if the previous iteration had at least as many primary terms, it is accepted instead
/// (the method then returns <paramref name="lastIteration"/> itself - callers rely on that
/// reference identity to detect convergence).
/// </summary>
/// <param name="lastIteration">The construct produced by the previous iteration.</param>
/// <param name="resolver">The resolver used for POS lookup of each lemma.</param>
/// <param name="allChunks">All chunks under consideration.</param>
/// <param name="settings">Cloud constructor settings driving thresholds and switches.</param>
/// <param name="subjects">The subjects; passed to the new construct.</param>
/// <param name="logger">The logger receiving optimization progress messages.</param>
/// <returns>The next construct, or <paramref name="lastIteration"/> when it is accepted as final.</returns>
public static lemmaSemanticConstruct NextIteration(lemmaSemanticConstruct lastIteration, ITextResourceResolver resolver, List<webLemmaTerm> allChunks, cloudConstructorSettings settings, List<pipelineTaskMCSiteSubject> subjects, ILogBuilder logger)
{
    var c = new lemmaSemanticConstruct(subjects);
    c.createdInIteration = lastIteration.createdInIteration + 1;
    c.PTCountMin = Math.Min(lastIteration.PTCountMin, lastIteration.PrimaryTerms.Count);
    c.PTCountMax = Math.Max(lastIteration.PTCountMax, lastIteration.PrimaryTerms.Count);

    if (!c.isCaseCloud)
    {
        // threshold grows with each iteration, shrinking the top-chunk set
        c.onTopChunks.AddRange(allChunks.Where(x => x.documentSetFrequency > (settings.documentSetFreqLowLimit + lastIteration.createdInIteration)));
    }
    else
    {
        // case clouds do not iterate unless factoring is enabled
        if (!settings.doFactorToCaseClouds)
        {
            c.OptimizationDone = true;
        }
        c.onTopChunks = allChunks;
    }

    if (!c.isCaseCloud)
    {
        // count lemma occurrences within the top chunks
        instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();
        foreach (webLemmaTerm chunk in c.onTopChunks)
        {
            var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
            lemmaCounter.AddInstanceRange(lemmas);
        }
        lemmaCounter.reCalculate();

        foreach (String st in lemmaCounter)
        {
            // lemmas occurring more than once qualify (or all of them, when nothing repeats)
            if (lemmaCounter.maxFreq == 1 || lemmaCounter[st] > 1)
            {
                var lu = resolver.GetLexicUnit(st, logger);
                if (lu == null)
                {
                    c.TrashBin.AddUnique(st);
                }
                else
                {
                    var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                    if (tg.Contains(pos_type.N))
                    {
                        c.PrimaryTerms.AddUnique(st);
                    }
                    else if (tg.Contains(pos_type.A))
                    {
                        c.SecondaryTerms.AddUnique(st);
                    }
                    else
                    {
                        c.TrashBin.AddUnique(st);
                    }
                }
            }
        }

        // <---------------------------- Primary terms extracted
        // fallback: promote adjectives when no noun qualified
        if (c.PrimaryTerms.Count == 0)
        {
            if (c.SecondaryTerms.Any())
            {
                logger.log(":: Moving Adjective terms [" + c.SecondaryTerms.Count + "] to Primary Terms category, as no Nouns were qualified to the cateogry");
                c.PrimaryTerms.AddRange(c.SecondaryTerms);
                c.SecondaryTerms.Clear();
            }
        }
    }

    // global lemma occurrence counts across all chunks
    instanceCountCollection<String> secondCounter = new instanceCountCollection<string>();
    foreach (webLemmaTerm chunk in allChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        secondCounter.AddInstanceRange(lemmas);
    }

    foreach (webLemmaTerm chunk in allChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        if (lemmas.ContainsAny(c.PrimaryTerms))
        {
            // chunk touches a primary term: route it to primary or secondary chunk list
            if (c.onTopChunks.Contains(chunk))
            {
                c.primaryChunks.Add(chunk);
            }
            else
            {
                c.secondaryChunks.Add(chunk);
            }

            // unclassified nouns/adjectives of such chunks become secondary terms
            foreach (String lm in lemmas)
            {
                if (c.NotProcessed(lm))
                {
                    var lu = resolver.GetLexicUnit(lm, logger);
                    if (lu == null)
                    {
                        c.TrashBin.AddUnique(lm);
                    }
                    else
                    {
                        var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                        if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                        {
                            c.SecondaryTerms.AddUnique(lm);
                        }
                        else
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                    }
                }
            }
        }
        else
        {
            // chunk without primary terms: frequent nouns/adjectives become reserve terms
            foreach (String lm in lemmas)
            {
                if (secondCounter[lm] > settings.termInChunkLowerLimit)
                {
                    if (c.NotProcessed(lm))
                    {
                        var lu = resolver.GetLexicUnit(lm, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                            if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                            {
                                c.ReserveTerms.AddUnique(lm);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                        }
                    }
                }
                else
                {
                    c.TrashBin.AddUnique(lm);
                }
            }
        }
    }

    if (c.OptimizationDone) { return (c); }

    c.PTCountMin = Math.Min(lastIteration.PTCountMin, c.PrimaryTerms.Count);
    c.PTCountMax = Math.Max(lastIteration.PTCountMax, c.PrimaryTerms.Count);

    if (c.PrimaryTerms.Count <= settings.primaryTermLowTargetCount)
    {
        if (lastIteration.PrimaryTerms.Count < c.PrimaryTerms.Count)
        {
            logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count [" + c.PrimaryTerms.Count + "] after [" + c.createdInIteration + "] iterations optimized ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
        }
        else
        {
            logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "] --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
            logger.log("[" + c.createdInIteration.ToString("D3") + "] previous PrimaryTerms count [" + lastIteration.PrimaryTerms.Count + "] accepted, after [" + c.createdInIteration + "] ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
            // accept the previous iteration; returning the same reference signals convergence to the caller
            c = lastIteration;
        }
        c.OptimizationDone = true;
    }
    else
    {
        logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "] --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
    }
    return (c);
}