/// <summary>
/// Processes the specified chunk table into a semantic cloud, dispatching to the
/// algorithm selected in <c>settings.algorithm</c>
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table; its name becomes the class name of a freshly created cloud.</param>
/// <param name="output">Cloud to populate; a new instance is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">The resolver.</param>
/// <returns>The populated, weaver-processed cloud</returns>
/// <exception cref="aceScienceException">Propagated from the selected algorithm when cloudConstructor
/// settings are contradictory (assignChunkTableWeightToLink=true without doSumExistingLinkWeights)</exception>
public lemmaSemanticCloud process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
        // class name is assigned only when the cloud is created here
        output.className = termTable.name;
    }

    cloudConstructorAlgorithm algorithm = settings.algorithm;
    if (algorithm == cloudConstructorAlgorithm.complex)
    {
        output = processPOSEnhanced(chunkTable, termTable, output, logger, subjects, resolver);
    }
    else if (algorithm == cloudConstructorAlgorithm.standard)
    {
        output = processStandard(chunkTable, termTable, output, logger, subjects);
    }
    else if (algorithm == cloudConstructorAlgorithm.alternative)
    {
        output = processAlternative(chunkTable, termTable, output, logger, subjects, resolver);
    }

    output.RebuildIndex();
    output.weaverReport = settings.cloudWeaver.Process(output, logger);
    // index is rebuilt a second time after the weaver pass
    output.RebuildIndex();
    return output;
}
/// <summary>
/// Standard cloud construction: splits each selected chunk into lemmas, creates one node per
/// distinct lemma and links every lemma group's first member to the rest of the group
/// </summary>
/// <param name="chunkTable">The chunk table supplying multi-word chunks.</param>
/// <param name="termTable">The term table; consulted for node weights when assignTermTableWeightToNode is on.</param>
/// <param name="output">Cloud to populate; a new instance is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects; with more than one subject, chunks are filtered by document-set frequency.</param>
/// <returns>The populated cloud</returns>
/// <exception cref="aceScienceException">Thrown when assignChunkTableWeightToLink=true while
/// doSumExistingLinkWeights=false, because an already-existing link would keep an arbitrary weight</exception>
protected lemmaSemanticCloud processStandard(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    List<webLemmaTerm> chunkList = chunkTable.GetList();

    // with multiple subjects only chunks above the document-set frequency limit are considered
    IEnumerable<webLemmaTerm> selectedChunks;
    if (subjects.Count > 1)
    {
        selectedChunks = chunkList.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
    }
    else
    {
        selectedChunks = chunkList;
    }

    // groups keyed by the same list instance stored in lemmaGroups (reference-equality lookup)
    Dictionary<List<String>, webLemmaTerm> chunkByLemmaGroup = new Dictionary<List<String>, webLemmaTerm>();
    List<List<String>> lemmaGroups = new List<List<String>>();
    List<String> vocabulary = new List<String>();

    foreach (webLemmaTerm chunk in selectedChunks)
    {
        // lemmas shorter than 3 characters are dropped; single-lemma chunks produce no links
        List<String> lemmaGroup = chunk.nominalForm.SplitSmart(" ", "", true, true).Where(x => x.Length > 2).ToList();
        if (lemmaGroup.Count < 2)
        {
            continue;
        }
        lemmaGroup.Sort((x, y) => String.CompareOrdinal(x, y));
        lemmaGroups.Add(lemmaGroup);
        chunkByLemmaGroup.Add(lemmaGroup, chunk);
        vocabulary.AddRange(lemmaGroup, true);
    }

    // ---- node creation ----
    foreach (String term in vocabulary)
    {
        Double nodeWeight = settings.assignTermTableWeightToNode ? termTable.ResolveSingleTerm(term, logger) : 1;
        if (nodeWeight > 0)
        {
            output.AddNode(term, nodeWeight);
        }
    }

    // ---- link creation: first (ordinal-sorted) lemma of each group is the anchor ----
    foreach (List<String> lemmaGroup in lemmaGroups)
    {
        String anchor = lemmaGroup[0];
        if (!output.ContainsNode(anchor, true))
        {
            continue;
        }

        foreach (String other in lemmaGroup)
        {
            if (other == anchor)
            {
                continue;
            }
            if (!output.ContainsNode(other, true))
            {
                continue;
            }

            Double linkWeight;
            if (settings.assignChunkTableWeightToLink)
            {
                linkWeight = chunkByLemmaGroup[lemmaGroup].weight;
            }
            else if (settings.doAdjustLinkWeightByChunkSize)
            {
                linkWeight = (lemmaGroup.Count - 1).GetRatio(1);
            }
            else
            {
                linkWeight = 1;
            }

            var existingLink = output.GetLink(anchor, other);
            if (existingLink == null)
            {
                output.AddLink(anchor, other, linkWeight);
            }
            else if (settings.doSumExistingLinkWeights)
            {
                existingLink.weight += linkWeight;
            }
            else if (settings.assignChunkTableWeightToLink)
            {
                // the existing link keeps whatever weight it got first, which makes the
                // chunk-table weight assignment order-dependent - refuse such configuration
                throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" + ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
            }
        }
    }

    return output;
}
/// <summary>
/// Builds the cloud - common part of the algorithm: turns the construct's vocabulary into nodes
/// (optionally factored by primary/secondary/reserve term class) and wires links between lemmas
/// that co-occur in the same chunk
/// </summary>
/// <param name="c">The construct holding node names, lemma groups, chunk weights and trash-bin terms.</param>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table; source of node weights when assignTermTableWeightToNode is on.</param>
/// <param name="output">The cloud receiving nodes and links.</param>
/// <param name="logger">The logger.</param>
/// <param name="resolver">The resolver.</param>
/// <returns>The populated cloud</returns>
/// <exception cref="aceScienceException">Thrown when assignChunkTableWeightToLink=true while
/// doSumExistingLinkWeights=false, because an already-existing link would keep an arbitrary weight</exception>
protected lemmaSemanticCloud BuildCloud(lemmaSemanticConstruct c, webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, ITextResourceResolver resolver)
{
    // purge trashed terms from the vocabulary before any node is created
    foreach (String trashed in c.TrashBin)
    {
        c.nodeNames.Remove(trashed);
    }

    // ---- node creation ----
    foreach (String term in c.nodeNames)
    {
        Double nodeWeight;
        if (settings.assignTermTableWeightToNode)
        {
            nodeWeight = 0;
            var lemma = termTable[term];
            if (lemma != null)
            {
                nodeWeight = lemma.weight;
            }
        }
        else
        {
            nodeWeight = 1;
        }

        if (!(nodeWeight > 0))
        {
            continue; // terms without a positive weight never become nodes
        }

        // case clouds and class clouds share identical factoring logic, gated by different flags
        Boolean applyFactor = c.isCaseCloud ? settings.doFactorToCaseClouds : settings.doFactorToClassClouds;
        if (!applyFactor)
        {
            output.AddNode(term, nodeWeight);
        }
        else if (c.PrimaryTerms.Contains(term))
        {
            output.AddNode(term, nodeWeight * settings.PrimaryTermWeightFactor, 2);
        }
        else if (c.SecondaryTerms.Contains(term))
        {
            output.AddNode(term, nodeWeight * settings.SecondaryTermWeightFactor, 1);
        }
        else
        {
            output.AddNode(term, nodeWeight * settings.ReserveTermWeightFactor, 0);
        }
    }

    // ---- link creation: first lemma of each group is the anchor ----
    foreach (List<String> lemmaGroup in c.lemmasList)
    {
        String anchor = lemmaGroup[0];
        if (c.TrashBin.Contains(anchor))
        {
            continue;
        }
        if (!output.ContainsNode(anchor, true))
        {
            continue;
        }

        foreach (String other in lemmaGroup)
        {
            if (c.TrashBin.Contains(other))
            {
                continue;
            }
            if (other == anchor)
            {
                continue;
            }
            if (!output.ContainsNode(other, true))
            {
                continue;
            }

            Double linkWeight;
            if (settings.assignChunkTableWeightToLink)
            {
                linkWeight = c.weightDict[lemmaGroup].weight;
            }
            else if (settings.doAdjustLinkWeightByChunkSize)
            {
                linkWeight = (lemmaGroup.Count - 1).GetRatio(1);
            }
            else
            {
                linkWeight = 1;
            }

            var existingLink = output.GetLink(anchor, other);
            if (existingLink == null)
            {
                output.AddLink(anchor, other, linkWeight);
            }
            else if (settings.doSumExistingLinkWeights)
            {
                existingLink.weight += linkWeight;
            }
            else if (settings.assignChunkTableWeightToLink)
            {
                // the existing link keeps whatever weight it got first, which makes the
                // chunk-table weight assignment order-dependent - refuse such configuration
                throw new aceScienceException("This is stupid. Settings for cloudConstructor have assignChunkTableWeightToLink=true but it will not create new link in case the lemmas are already linked" + ", therefore resulting weight is assigned just by chance! Change cloudConstructor settings bro, to make some sense.", null, this, "cloudConstructor has irrational settings", settings);
            }
        }
    }

    // expose nominal forms of the construct's chunks on the cloud
    foreach (var chunk in c.primaryChunks)
    {
        output.primaryChunks.Add(chunk.nominalForm);
    }
    foreach (var chunk in c.secondaryChunks)
    {
        output.secondaryChunks.Add(chunk.nominalForm);
    }

    return output;
}
/// <summary>
/// Alternative cloud construction: counts lemma occurrences across the most important (VIP)
/// chunks, classifies lemmas into primary/secondary/reserve term sets by frequency, then builds
/// the cloud from all chunks that mention at least one relevant term
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table.</param>
/// <param name="output">Cloud to populate; a new instance is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects; with more than one subject, VIP chunks are filtered by document-set frequency.</param>
/// <param name="resolver">The resolver (forwarded to BuildCloud).</param>
/// <returns>The cloud produced by <see cref="BuildCloud"/></returns>
protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

    List<webLemmaTerm> allChunks = chunkTable.GetList();

    // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
    IEnumerable<webLemmaTerm> vipChunks = null;
    if (subjects.Count > 1)
    {
        vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
    }
    else
    {
        vipChunks = allChunks;
    }

    instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();
    // FIX(review): removed local `primaryLemmaList`, which was declared but never read or written

    foreach (webLemmaTerm chunk in vipChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        lemmaCounter.AddInstanceRange(lemmas);
    }

    c.RelevantTerms = lemmaCounter.getSorted();
    lemmaCounter.reCalculate(); // refresh counter statistics (minFreq/maxFreq) before classification

    // frequency-based classification: top frequency -> primary, above minimum -> secondary, rest -> reserve
    foreach (String term in c.RelevantTerms)
    {
        if (lemmaCounter[term] == lemmaCounter.maxFreq)
        {
            c.PrimaryTerms.Add(term);
        }
        else if (lemmaCounter[term] > lemmaCounter.minFreq)
        {
            c.SecondaryTerms.Add(term);
        }
        else
        {
            c.ReserveTerms.Add(term);
        }
    }

    c.CollectRelevantTerms(settings.doReserveTermsForClass);
    c.LogConstruct(logger);

    // <--------------------------------- keep only chunks mentioning at least one relevant term
    var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        if (lemmas.Count > 1)
        {
            lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
            c.lemmasList.Add(lemmas);
            c.weightDict.Add(lemmas, chunk);
            c.nodeNames.AddRange(lemmas, true);
        }
    }

    return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
}
/// <summary>
/// POS-enhanced cloud construction: iteratively optimizes the primary-term construct via
/// lemmaSemanticConstruct.NextIteration, then keeps only lemmas whose lexic units carry
/// noun (N) or adjective (A) POS tags before building the cloud
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table.</param>
/// <param name="output">Cloud to populate; a new instance is created when null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">The resolver, used to look up lexic units and their POS tags.</param>
/// <returns>The cloud produced by <see cref="BuildCloud"/></returns>
protected lemmaSemanticCloud processPOSEnhanced(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    List<webLemmaTerm> allChunks = chunkTable.GetList();

    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }

    // <----------------- PRIMARY term optimization loop
    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);
    lemmaSemanticConstruct cl = new lemmaSemanticConstruct(subjects);

    // NOTE(review): `c != cl` is a reference comparison — the loop presumably runs until
    // NextIteration returns the same instance it was given (a fixed point); TODO confirm
    while (c != cl)
    {
        c = cl;
        cl = lemmaSemanticConstruct.NextIteration(cl, resolver, allChunks, settings, subjects, logger);
        // safety valve: stop when the iteration count exceeds the configured limit
        if (cl.createdInIteration > settings.primaryTermOptimizationIterationLimit)
        {
            c = cl;
            break;
        }
        if (cl.OptimizationDone)
        {
            break;
        }
    }
    c = cl;

    // <------------------- PRIM: collect the optimized term sets
    c.CollectRelevantTerms(settings.doReserveTermsForClass);

    if (!c.isCaseCloud)
    {
        c.LogConstruct(logger);
    }

    // <--------------------------------- keep only chunks mentioning at least one relevant term
    var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        List<String> l_out = new List<string>();

        foreach (String lm in lemmas)
        {
            if (c.NotProcessed(lm))
            {
                // first encounter of this lemma: resolve its lexic unit to read POS tags
                var lu = resolver.GetLexicUnit(lm, logger);
                if (lu == null)
                {
                    // unknown to the resolver -> discard permanently
                    c.TrashBin.AddUnique(lm);
                }
                else
                {
                    var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                    // only nouns and adjectives survive the POS filter
                    if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                    {
                        c.ReserveTerms.AddUnique(lm);
                        l_out.Add(lm);
                    }
                    else
                    {
                        c.TrashBin.AddUnique(lm);
                    }
                }
            }
            else
            {
                // already processed: keep it unless it was trashed earlier
                if (!c.TrashBin.Contains(lm))
                {
                    l_out.Add(lm);
                }
            }
        }

        // only multi-lemma chunks contribute lemma groups (single lemmas cannot form links)
        if (l_out.Count > 1)
        {
            l_out.Sort((x, y) => String.CompareOrdinal(x, y));
            c.lemmasList.Add(l_out);
            c.weightDict.Add(l_out, chunk);
            c.nodeNames.AddRange(l_out, true);
        }
    }

    return (BuildCloud(c, chunkTable, termTable, output, logger, resolver));
}
/// <summary>
/// Returns expanded cloud from given lemma list - only for matched lemmas. Seeds a new cloud
/// with the query lemmas found in this cloud, then grows it breadth-first, one ring of linked
/// neighbours per expansion step
/// </summary>
/// <param name="lemmas">The lemmas used as expansion seeds; unmatched ones are ignored.</param>
/// <param name="expansionSteps">The expansion steps (number of neighbourhood rings to add).</param>
/// <param name="typeToMin">When true, colliding link types resolve to the smaller value, otherwise to the larger.</param>
/// <param name="options">The options controlling weight propagation during expansion.</param>
/// <returns>A new cloud named after this one, containing the matched lemmas and their expanded neighbourhood</returns>
public lemmaSemanticCloud ExpandTermsToCloud(IEnumerable<String> lemmas, Int32 expansionSteps, Boolean typeToMin = true, lemmaExpansionOptions options = lemmaExpansionOptions.initialWeightFromParent | lemmaExpansionOptions.weightAsSemanticDistanceFromParent)
{
    lemmaSemanticCloud output = new lemmaSemanticCloud();
    output.name = name + "_subset_exp" + expansionSteps;
    output.DisableCheck = true;

    StringBuilder sb = new StringBuilder();
    sb.Append("Subset expanded from matched query lemmas [");

    List<String> nextTerms = new List<string>(); // frontier of the current expansion ring
    List<String> allTerms = new List<string>();  // every term already pulled into the output

    // seed the output with the query lemmas that exist in this cloud
    foreach (String t in lemmas)
    {
        if (ContainsNode(t))
        {
            sb.Append(t + " ");
            var l = GetNode(t);
            output.AddNode(l.name, l.weight, 0).distance = 1;
            nextTerms.Add(t);
            allTerms.Add(t);
        }
    }

    sb.Append("] using cloud [" + name + "]");
    output.description = sb.ToString();

    Int32 exp_i = 1;
    while (nextTerms.Any())
    {
        List<String> newNextTerms = new List<string>();
        foreach (String t in nextTerms)
        {
            freeGraphNodeAndLinks links;
            if (options.HasFlag(lemmaExpansionOptions.weightAsSemanticDistanceFromParent))
            {
                // link weight shrinks with the ring index (distance from the seed)
                links = GetLinks(t, true, false, 1.GetRatio(exp_i), exp_i, true, options.HasFlag(lemmaExpansionOptions.initialWeightFromParent));
            }
            else
            {
                // FIX(review): the weightAsSemanticDistanceThatIsSumOfLinkWeights branch was
                // byte-identical to the default one apart from an unused local
                // (`var nd = output.GetNode(t, true)`), so both branches are folded here
                links = GetLinks(t, true, false, 1, exp_i, true, options.HasFlag(lemmaExpansionOptions.initialWeightFromParent));
            }

            // register/update the nodes on both ends of each discovered link
            foreach (freeGraphLink link in links)
            {
                if (!allTerms.Contains(link.nodeA.name))
                {
                    newNextTerms.Add(link.nodeA.name);
                    allTerms.Add(link.nodeA.name);
                }
                if (link.nodeA.name != t)
                {
                    output.AddOrUpdateNode(link.nodeA, link, links, typeToMin, options);
                }
                if (!allTerms.Contains(link.nodeB.name))
                {
                    newNextTerms.Add(link.nodeB.name);
                    allTerms.Add(link.nodeB.name);
                }
                if (link.nodeB.name != t)
                {
                    output.AddOrUpdateNode(link.nodeB, link, links, typeToMin, options);
                }
            }

            // copy or merge the link records themselves
            foreach (freeGraphLink link in links)
            {
                if (!output.ContainsLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB))
                {
                    // new link: weight is clamped to at least 1
                    output.AddLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB, Math.Max(link.linkBase.weight, 1), link.linkBase.type);
                }
                else
                {
                    var lnk = output.GetLink(link.linkBase.nodeNameA, link.linkBase.nodeNameB);
                    lnk.weight += link.linkBase.weight;
                    if (typeToMin)
                    {
                        lnk.type = Math.Min(link.linkBase.type, lnk.type);
                    }
                    else
                    {
                        lnk.type = Math.Max(link.linkBase.type, lnk.type);
                    }
                }
            }
        }

        nextTerms = newNextTerms;
        exp_i++;
        if (exp_i > expansionSteps)
        {
            break;
        }
    }

    output.DisableCheck = false;
    output.RebuildIndex();
    return output;
}