/// <summary>
/// Prepares this instance - clears temporary data
/// </summary>
public override void prepare()
{
    // reset the page-title counter before a new run
    pageTitleCount = new instanceCountCollection<string>();
    pageTitleCount.compareModeDefault = instanceCountCollectionFormulae.keyCount;
}
/// <summary>
/// Gets the counter of node-name occurrences across all cloud pairs in the matrix.
/// </summary>
/// <param name="ofCurrentState">if set to <c>true</c>, counts the current state; note that the current implementation ignores this parameter.</param>
/// <returns>Frequency collection with square-root damped counts</returns>
public instanceCountCollection<String> GetCounter(Boolean ofCurrentState = true)
{
    instanceCountCollection<String> counter = new instanceCountCollection<string>();

    foreach (lemmaSemanticCloud x in this.Get1stKeys())
    {
        foreach (lemmaSemanticCloud y in this.Get2ndKeys(x))
        {
            var nd = this[x, y];
            foreach (var n in nd)
            {
                counter.AddInstance(n.name);
            }
        }
    }

    // square-root damping: compresses the raw counts before recalculation
    instanceCountCollection<String> output = new instanceCountCollection<string>();
    foreach (String n in counter.Keys)
    {
        output.AddInstance(n, Convert.ToInt32(Math.Sqrt(counter[n])));
    }
    output.reCalculate();
    return output;
}
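// Illustrative sketch (not part of the library): the square-root damping applied by
// GetCounter() above, rewritten over a plain Dictionary so the idea is visible in
// isolation. All names below are hypothetical.
using System;
using System.Collections.Generic;

static class SqrtDampingSketch
{
    // Compresses raw occurrence counts (1 -> 1, 4 -> 2, 9 -> 3), so terms repeated
    // across many cloud pairs no longer dominate the frequency distribution.
    public static Dictionary<string, int> Damp(Dictionary<string, int> rawCounts)
    {
        var damped = new Dictionary<string, int>();
        foreach (var pair in rawCounts)
        {
            damped[pair.Key] = Convert.ToInt32(Math.Sqrt(pair.Value));
        }
        return damped;
    }
}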
/// <summary>
/// Gets the text token stats.
/// </summary>
/// <returns></returns>
public instanceCountCollection<string> GetTextTokenStats()
{
    instanceCountCollection<string> output = new instanceCountCollection<string>();
    output.AddInstanceRange((IEnumerable<string>)textContent.getTokens(true, true, true));
    return output;
}
/// <summary>
/// Calculates the entropy.
/// </summary>
/// <returns></returns>
public double CalculateEntropy()
{
    instanceCountCollection<string> textStats = GetTextTokenStats();
    textStats.reCalculate();
    return textStats.entropyFreq;
}
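// Illustrative sketch (assumption: entropyFreq is the Shannon entropy of the relative
// token frequencies; the library may scale or normalize differently). Plain BCL,
// hypothetical names.
using System;
using System.Collections.Generic;
using System.Linq;

static class EntropySketch
{
    // H = -sum(p_i * log2(p_i)) over relative token frequencies
    public static double ShannonEntropy(IEnumerable<string> tokens)
    {
        List<double> counts = tokens.GroupBy(t => t).Select(g => (double)g.Count()).ToList();
        double total = counts.Sum();
        return -counts.Sum(c => (c / total) * Math.Log(c / total, 2));
    }
}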
public void process()
{
    stats = new instanceCountCollection<tokenQueryResultEnum>();
    foreach (tokenQuerySourceEnum key in Keys)
    {
        foreach (tokenQueryResponse res in this[key])
        {
            stats.AddInstance(res.response, 1);

            // collect only non-empty descriptions; the original condition lacked the
            // negation and would have collected only null/empty ones
            if (!res.description.isNullOrEmpty())
            {
                description.Add(res.description);
            }
        }
    }
}
public override void Learn(IEnumerable<TextDocumentSet> documentSets)
{
    foreach (TextDocumentSet docSet in documentSets)
    {
        stats.Add(docSet.name, new Dictionary<String, instanceCountCollection<String>>());
        foreach (TextDocumentLayerCollection document in docSet)
        {
            String content = document.ToString();
            List<String> tkns = content.getTokens(true, true, true, false, 4);

            instanceCountCollection<string> ft = new instanceCountCollection<string>();
            ft.AddInstanceRange(tkns);
            ft.reCalculate(instanceCountCollection<string>.preCalculateTasks.all);

            stats[docSet.name].Add(document.name, ft);
        }
    }
}
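// Minimal stand-in (hypothetical names, plain BCL) for the structure Learn() fills:
// document set -> document -> term frequency table.
using System.Collections.Generic;
using System.Linq;

static class TermFrequencySketch
{
    public static Dictionary<string, Dictionary<string, Dictionary<string, int>>> Learn(
        IEnumerable<(string setName, IEnumerable<(string docName, IEnumerable<string> tokens)> docs)> sets)
    {
        var stats = new Dictionary<string, Dictionary<string, Dictionary<string, int>>>();
        foreach (var (setName, docs) in sets)
        {
            stats[setName] = new Dictionary<string, Dictionary<string, int>>();
            foreach (var (docName, tokens) in docs)
            {
                // count token occurrences per document
                stats[setName][docName] = tokens.GroupBy(t => t).ToDictionary(g => g.Key, g => g.Count());
            }
        }
        return stats;
    }
}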
/// <summary>
/// Gets all proper tokens sorted by frequency.
/// </summary>
/// <param name="input_contentTokens">The input content tokens.</param>
/// <param name="tokenLengthMin">The minimum token length.</param>
/// <param name="input_ignoredTokens">Tokens to ignore; may be null.</param>
/// <returns>Tokens that passed all filters, sorted by frequency</returns>
public List<string> GetAllProperTokensSortedByFrequency(IEnumerable<string> input_contentTokens, int tokenLengthMin, List<string> input_ignoredTokens)
{
    instanceCountCollection<string> tokenFrequency = new instanceCountCollection<string>();
    if (input_ignoredTokens == null)
    {
        input_ignoredTokens = new List<string>();
    }

    // <----- preprocessing token input
    foreach (string token in input_contentTokens)
    {
        bool useOk = true;
        if (useOk && token.isNullOrEmptyString()) { useOk = false; }
        if (useOk && (token == Environment.NewLine)) { useOk = false; }
        if (useOk && (token.Length < tokenLengthMin)) { useOk = false; }
        if (useOk && input_ignoredTokens.Contains(token)) { useOk = false; }
        if (useOk && globalIgnoreList.Contains(token)) { useOk = false; }

        if (useOk)
        {
            tokenFrequency.AddInstance(token);
        }
    }

    List<string> tokenToTest = tokenFrequency.getSorted();
    return tokenToTest;
}
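// The same filter-then-sort logic expressed with plain LINQ, as a sketch; the
// globalIgnoreList field is replaced here by an explicit parameter (hypothetical names).
using System;
using System.Collections.Generic;
using System.Linq;

static class TokenFilterSketch
{
    public static List<string> ProperTokensByFrequency(IEnumerable<string> tokens, int minLength, ISet<string> ignored)
    {
        return tokens
            .Where(t => !string.IsNullOrEmpty(t)
                        && t != Environment.NewLine
                        && t.Length >= minLength
                        && !ignored.Contains(t))
            .GroupBy(t => t)
            .OrderByDescending(g => g.Count())   // most frequent first
            .Select(g => g.Key)
            .ToList();
    }
}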
public Int32 GetDominantClass()
{
    instanceCountCollection<Int32> counter = new instanceCountCollection<int>();
    foreach (FeatureVector fv in this)
    {
        Int32 dd = fv.GetDominantDimension();
        if (dd > -1)
        {
            counter.AddInstance(dd);
        }
    }
    if (counter.Count == 0)
    {
        return -1;
    }
    // the most frequent dominant dimension wins
    return counter.getSorted(1).First();
}
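// Majority vote over dominant dimensions, sketched with plain collections; mirrors
// GetDominantClass() under the assumption that getSorted(1) returns keys ordered by
// descending count (hypothetical stand-in, not the library code).
using System.Collections.Generic;
using System.Linq;

static class MajorityVoteSketch
{
    public static int DominantClass(IEnumerable<int> dominantDimensions)
    {
        var votes = dominantDimensions.Where(d => d > -1).ToList();
        if (votes.Count == 0) return -1;          // no vector had a dominant dimension
        return votes.GroupBy(d => d)
                    .OrderByDescending(g => g.Count())
                    .First().Key;                 // the most frequent dimension wins
    }
}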
/// <summary>
/// Prepares the factor by processing the context
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    statsByAssignedID.Clear();
    foreach (DocumentSelectResultEntry docEntry in context.items)
    {
        instanceCountCollection<string> ft = new instanceCountCollection<string>();

        if (docEntry.type.HasFlag(DocumentSelectEntryType.spaceDocument))
        {
            SpaceDocumentModel document = docEntry.spaceDocument;
            foreach (var term in document.terms.GetTokens())
            {
                ft.AddInstance(term, document.terms.GetTokenFrequency(term));
            }
        }
        else if (docEntry.type.HasFlag(DocumentSelectEntryType.textDocument))
        {
            String content = docEntry.textDocument.content;
            List<String> tkns = content.getTokens(true, true, true, false, 4);
            foreach (String tkn in tkns)
            {
                String stem = tkn;
                if (useStems)
                {
                    stem = context.stemmingContext.Stem(tkn);
                }
                ft.AddInstance(stem);
            }
        }

        statsByAssignedID.Add(docEntry.AssignedID, ft);
        assignedIDs.Add(docEntry.AssignedID);
    }
}
/// <summary>
/// Exports the text report
/// </summary>
/// <param name="folder">The folder.</param>
/// <param name="reduced">if set to <c>true</c>, reports the reduced (filtered) state.</param>
/// <param name="prefix">The prefix.</param>
public void ExportTextReports(folderNode folder, Boolean reduced, String prefix = "")
{
    foreach (lemmaSemanticCloud x in this.Get1stKeys())
    {
        instanceCountCollection<string> c = GetCounter(reduced);
        var srt = c.getSorted();

        String fn = prefix + x.className;
        fn = fn + (reduced ? "_reduced_" : "_initial_");
        fn = fn + "_overlap.txt";
        fn = folder.pathFor(fn, imbSCI.Data.enums.getWritableFileMode.overwrite, "Cloud Frequency report for all terms in the Cloud Matrix");

        List<String> lines = new List<string>();
        foreach (string ci in srt)
        {
            if (reduced)
            {
                // in the reduced report, terms occurring only once are omitted
                if (c[ci] > 1)
                {
                    lines.Add(String.Format("{1} : {0}", ci, c[ci] - 1));
                }
            }
            else
            {
                lines.Add(String.Format("{1} : {0}", ci, c[ci]));
            }
        }
        File.WriteAllLines(fn, lines);
    }
}
/// <summary>
/// Prepares this instance - clears temporary data
/// </summary>
public override void prepare()
{
    // reset the title-word counter before a new run
    titleWords = new instanceCountCollection<string>();
}
/// <summary>
/// Alternative processing: detects the most important terms and builds the semantic cloud from the chunk and term tables.
/// </summary>
/// <param name="chunkTable">The chunk table.</param>
/// <param name="termTable">The term table.</param>
/// <param name="output">The output cloud; created if null.</param>
/// <param name="logger">The logger.</param>
/// <param name="subjects">The subjects.</param>
/// <param name="resolver">The resolver.</param>
/// <returns></returns>
protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List<pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
{
    if (output == null)
    {
        output = new lemmaSemanticCloud();
    }
    lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

    List<webLemmaTerm> allChunks = chunkTable.GetList();

    // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
    IEnumerable<webLemmaTerm> vipChunks = null;
    if (subjects.Count > 1)
    {
        vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
    }
    else
    {
        vipChunks = allChunks;
    }

    instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();
    List<List<String>> primaryLemmaList = new List<List<String>>();

    foreach (webLemmaTerm chunk in vipChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        lemmaCounter.AddInstanceRange(lemmas);
    }

    c.RelevantTerms = lemmaCounter.getSorted();
    lemmaCounter.reCalculate();

    // partition terms by frequency: max -> primary, above min -> secondary, rest -> reserve
    foreach (String term in c.RelevantTerms)
    {
        if (lemmaCounter[term] == lemmaCounter.maxFreq)
        {
            c.PrimaryTerms.Add(term);
        }
        else if (lemmaCounter[term] > lemmaCounter.minFreq)
        {
            c.SecondaryTerms.Add(term);
        }
        else
        {
            c.ReserveTerms.Add(term);
        }
    }

    c.CollectRelevantTerms(settings.doReserveTermsForClass);
    c.LogConstruct(logger);

    // <--------------------------------- collecting chunks that contain any relevant term
    var docSetFreq = allChunks.Where(x => c.RelevantTerms.Any(y => x.nominalForm.SplitSmart(" ", "", true, true).Contains(y)));

    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        if (lemmas.Count > 1)
        {
            lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
            c.lemmasList.Add(lemmas);
            c.weightDict.Add(lemmas, chunk);
            c.nodeNames.AddRange(lemmas, true);
        }
    }

    return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
}
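// Sketch of the primary/secondary/reserve partition used above, over a plain frequency
// dictionary (hypothetical stand-in): terms at the maximum frequency become primary,
// terms above the minimum become secondary, the rest go to the reserve list.
using System.Collections.Generic;
using System.Linq;

static class TermPartitionSketch
{
    public static (List<string> primary, List<string> secondary, List<string> reserve) Partition(Dictionary<string, int> freq)
    {
        int max = freq.Values.Max();
        int min = freq.Values.Min();
        var primary = freq.Where(p => p.Value == max).Select(p => p.Key).ToList();
        var secondary = freq.Where(p => p.Value < max && p.Value > min).Select(p => p.Key).ToList();
        var reserve = freq.Where(p => p.Value < max && p.Value == min).Select(p => p.Key).ToList();
        return (primary, secondary, reserve);
    }
}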
/// <summary>
/// Gets the value for the targeted cell
/// </summary>
/// <param name="x">The x cloud.</param>
/// <param name="y">The y cloud.</param>
/// <param name="type">The data table type flags.</param>
/// <param name="counter">The cloud-frequency counter.</param>
/// <returns></returns>
public Double GetCellNumber(lemmaSemanticCloud x, lemmaSemanticCloud y, cloudMatrixDataTableType type, instanceCountCollection<String> counter)
{
    Double output = 0;
    List<freeGraphNodeBase> selected = this[x, y];

    // running min/max are deliberately initialized to the opposite extremes
    Double min = MaxCloudFrequency;
    Double max = MinCloudFrequency;

    if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
    {
        if (type.HasFlag(cloudMatrixDataTableType.initialState))
        {
            output = selected.Sum(s => s.weight);
        }
        else
        {
            output = x.GetOverlap(y).Sum(s => s.weight);
        }
    }

    if (output == 0)
    {
        if (type.HasFlag(cloudMatrixDataTableType.normalizedValues))
        {
            if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
            {
                if (type.HasFlag(cloudMatrixDataTableType.initialState))
                {
                    output = selected.Count.GetRatio(MaxOverlap);
                }
                else
                {
                    output = (x == y) ? 0 : x.GetOverlap(y).Count.GetRatio(selected.Count);
                }
            }
            else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency) || type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
            {
                for (int i = 0; i < selected.Count; i++)
                {
                    freeGraphNodeBase ne = selected[i];
                    min = Math.Min(min, (Double)counter[ne.name]);
                    max = Math.Max(max, (Double)counter[ne.name]);
                }
                if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency))
                {
                    output = max.GetRatio(MaxCloudFrequency);
                }
                else
                {
                    output = min.GetRatio(MinCloudFrequency);
                }
            }
        }
        else
        {
            if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
            {
                if (type.HasFlag(cloudMatrixDataTableType.initialState))
                {
                    output = selected.Count;
                }
                else
                {
                    output = (x == y) ? 0 : x.GetOverlap(y).Count;
                }
            }
            else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency) || type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
            {
                for (int i = 0; i < selected.Count; i++)
                {
                    freeGraphNodeBase ne = selected[i];
                    min = Math.Min(min, (Double)counter[ne.name]);
                    max = Math.Max(max, (Double)counter[ne.name]);
                }
                output = type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency) ? max : min;
            }
        }
    }
    return output;
}
public industryLemmaRankTable process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, industryLemmaRankTable output)
{
    List<webLemmaTerm> allChunks = chunkTable.GetList();
    var docSetFreq = allChunks.Where(x => x.documentSetFrequency > 1);

    instanceCountCollection<String> termCounter = new instanceCountCollection<string>();
    aceDictionarySet<String, String> dict = new aceDictionarySet<string, string>();

    // count lemma occurrences and record which lemmas co-occur within the same chunk
    foreach (webLemmaTerm chunk in docSetFreq)
    {
        var lemmas = chunk.nominalForm.SplitSmart(textMapBase.SEPARATOR, "", true, true);
        lemmas = lemmas.Where(x => x.Length > 2).ToList();
        termCounter.AddInstanceRange(lemmas);

        foreach (String lm in lemmas)
        {
            foreach (String lmi in lemmas)
            {
                if (lmi != lm)
                {
                    dict[lm].AddUnique(lmi);
                }
            }
        }
    }

    // lemmas seen more than once become primary terms; their co-occurring lemmas become secondary
    List<String> primaries = new List<string>();
    foreach (String term in termCounter)
    {
        if (termCounter[term] > 1)
        {
            primaries.Add(term);
            industryLemmaTerm lemma = output.GetOrCreate(term);
            lemma.termType = industryLemmaTermType.primary;
            lemma.weight = settings.PrimaryTermFactor * termTable[lemma.name].weight;
            lemma.nominalForm = term;
            output.AddOrUpdate(lemma);

            if (dict.ContainsKey(lemma.nominalForm))
            {
                foreach (String secLemma in dict[lemma.nominalForm])
                {
                    industryLemmaTerm lemmaSec = output.GetOrCreate(secLemma);
                    if (lemmaSec.termType == industryLemmaTermType.none)
                    {
                        lemmaSec.termType = industryLemmaTermType.secondary;
                        lemmaSec.weight = settings.SecondaryTermFactor * termTable[lemmaSec.name].weight;
                        lemmaSec.nominalForm = secLemma;
                        output.AddOrUpdate(lemmaSec);
                    }
                }
            }
        }
    }

    return output;
}
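// Sketch of the lemma co-occurrence map built above: within every multi-lemma chunk,
// each lemma records all other lemmas it appeared with (plain BCL, hypothetical names).
using System.Collections.Generic;

static class CooccurrenceSketch
{
    public static Dictionary<string, HashSet<string>> Build(IEnumerable<List<string>> chunkLemmas)
    {
        var map = new Dictionary<string, HashSet<string>>();
        foreach (var lemmas in chunkLemmas)
        {
            foreach (var a in lemmas)
            {
                if (!map.TryGetValue(a, out var related)) map[a] = related = new HashSet<string>();
                foreach (var b in lemmas)
                {
                    if (a != b) related.Add(b);   // HashSet gives the AddUnique behavior
                }
            }
        }
        return map;
    }
}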
public static lemmaSemanticConstruct NextIteration(lemmaSemanticConstruct lastIteration, ITextResourceResolver resolver, List<webLemmaTerm> allChunks, cloudConstructorSettings settings, List<pipelineTaskMCSiteSubject> subjects, ILogBuilder logger)
{
    var c = new lemmaSemanticConstruct(subjects);
    c.createdInIteration = lastIteration.createdInIteration + 1;
    c.PTCountMin = Math.Min(lastIteration.PTCountMin, lastIteration.PrimaryTerms.Count);
    c.PTCountMax = Math.Max(lastIteration.PTCountMax, lastIteration.PrimaryTerms.Count);

    if (!c.isCaseCloud)
    {
        // each iteration raises the document-set frequency threshold by one
        c.onTopChunks.AddRange(allChunks.Where(x => x.documentSetFrequency > (settings.documentSetFreqLowLimit + lastIteration.createdInIteration)));
    }
    else
    {
        if (!settings.doFactorToCaseClouds)
        {
            c.OptimizationDone = true;
        }
        c.onTopChunks = allChunks;
    }

    if (!c.isCaseCloud)
    {
        instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();
        List<List<String>> primaryLemmaList = new List<List<String>>();

        foreach (webLemmaTerm chunk in c.onTopChunks)
        {
            var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
            lemmaCounter.AddInstanceRange(lemmas);
        }
        lemmaCounter.reCalculate();

        // nouns become primary terms, adjectives secondary; everything else is discarded
        foreach (String st in lemmaCounter)
        {
            if (lemmaCounter.maxFreq == 1 || lemmaCounter[st] > 1)
            {
                var lu = resolver.GetLexicUnit(st, logger);
                if (lu == null)
                {
                    c.TrashBin.AddUnique(st);
                }
                else
                {
                    var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                    if (tg.Contains(pos_type.N))
                    {
                        c.PrimaryTerms.AddUnique(st);
                    }
                    else if (tg.Contains(pos_type.A))
                    {
                        c.SecondaryTerms.AddUnique(st);
                    }
                    else
                    {
                        c.TrashBin.AddUnique(st);
                    }
                }
            }
        }

        // <---------------------------- Primary terms extracted
        if (c.PrimaryTerms.Count == 0)
        {
            if (c.SecondaryTerms.Any())
            {
                logger.log(":: Moving adjective terms [" + c.SecondaryTerms.Count + "] to the Primary Terms category, as no nouns qualified for it");
                c.PrimaryTerms.AddRange(c.SecondaryTerms);
                c.SecondaryTerms.Clear();
            }
        }
    }

    instanceCountCollection<String> secondCounter = new instanceCountCollection<string>();
    foreach (webLemmaTerm chunk in allChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        secondCounter.AddInstanceRange(lemmas);
    }

    foreach (webLemmaTerm chunk in allChunks)
    {
        var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
        if (lemmas.ContainsAny(c.PrimaryTerms))
        {
            if (c.onTopChunks.Contains(chunk))
            {
                c.primaryChunks.Add(chunk);
            }
            else
            {
                c.secondaryChunks.Add(chunk);
            }

            foreach (String lm in lemmas)
            {
                if (c.NotProcessed(lm))
                {
                    var lu = resolver.GetLexicUnit(lm, logger);
                    if (lu == null)
                    {
                        c.TrashBin.AddUnique(lm);
                    }
                    else
                    {
                        var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                        if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                        {
                            c.SecondaryTerms.AddUnique(lm);
                        }
                        else
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                    }
                }
            }
        }
        else
        {
            foreach (String lm in lemmas)
            {
                if (secondCounter[lm] > settings.termInChunkLowerLimit)
                {
                    if (c.NotProcessed(lm))
                    {
                        var lu = resolver.GetLexicUnit(lm, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags<pos_type>(pos_type.none);
                            if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                            {
                                c.ReserveTerms.AddUnique(lm);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                        }
                    }
                }
                else
                {
                    c.TrashBin.AddUnique(lm);
                }
            }
        }
    }

    if (c.OptimizationDone)
    {
        return c;
    }

    c.PTCountMin = Math.Min(lastIteration.PTCountMin, c.PrimaryTerms.Count);
    c.PTCountMax = Math.Max(lastIteration.PTCountMax, c.PrimaryTerms.Count);

    if (c.PrimaryTerms.Count <= settings.primaryTermLowTargetCount)
    {
        if (lastIteration.PrimaryTerms.Count < c.PrimaryTerms.Count)
        {
            logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count [" + c.PrimaryTerms.Count + "] after [" + c.createdInIteration + "] iterations optimized ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
        }
        else
        {
            logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "] --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
            logger.log("[" + c.createdInIteration.ToString("D3") + "] previous PrimaryTerms count [" + lastIteration.PrimaryTerms.Count + "] accepted, after [" + c.createdInIteration + "] ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
            c = lastIteration;
        }
        c.OptimizationDone = true;
    }
    else
    {
        logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "] --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
    }

    return c;
}
public bool discoverGram(termExploreItem item, ILogBuilder loger, bool debug = true)
{
    bool failed = false;

    if (loger != null)
    {
        loger.AppendLine("Item:" + item.inputForm);
    }

    instanceCountCollection<object> res = termDiscoveryResolver.resolveQuery(item.inputForm);
    res.reCalculate();

    if (res.Count > 0)
    {
        List<object> sorted = res.getSorted();

        // if the POS type is already known, drop the pos_type votes
        if (item.gramSet.getPosType() != pos_type.none)
        {
            sorted.RemoveAll(x => x is pos_type);
        }

        gramFlags gf = new gramFlags();
        if (sorted.Any(x => x is pos_type))
        {
            gf.Set((pos_type)sorted.First(x => x is pos_type));
        }

        // keep only the tag types compatible with the resolved POS type
        var tl = posConverter.posTypeVsPattern[gf.type];
        sorted.RemoveAll(x => !tl.Contains(x.GetType()));

        if (loger != null)
        {
            loger.AppendLine("Votes:");
            // Math.Min caps the listing at 20 votes; the original Math.Max would overrun the list
            for (int i = 0; i < Math.Min(sorted.Count(), 20); i++)
            {
                loger.Append(sorted[i].ToString() + "; ");
            }
        }

        // for each grammar-tag type, the first (highest ranked) vote wins
        if (sorted.Any(x => x is pos_gender)) { gf.Set((pos_gender)sorted.First(x => x is pos_gender)); }
        if (sorted.Any(x => x is pos_gramaticalCase)) { gf.Set((pos_gramaticalCase)sorted.First(x => x is pos_gramaticalCase)); }
        if (sorted.Any(x => x is pos_verbform)) { gf.Set((pos_verbform)sorted.First(x => x is pos_verbform)); }
        if (sorted.Any(x => x is pos_number)) { gf.Set((pos_number)sorted.First(x => x is pos_number)); }
        if (sorted.Any(x => x is pos_degree)) { gf.Set((pos_degree)sorted.First(x => x is pos_degree)); }
        if (sorted.Any(x => x is pos_person)) { gf.Set((pos_person)sorted.First(x => x is pos_person)); }

        if (loger != null)
        {
            loger.AppendLine("Final gram:" + gf.ToString());
        }
        item.gramSet.Add(gf);
    }
    else
    {
        if (item.inputForm.Length < 4)
        {
            return false;
        }
        failed = true;
    }
    return failed;
}
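// Loose sketch of the vote-resolution idea in discoverGram: given a ranked list of
// heterogeneous grammar-tag objects, keep the first (strongest) instance of each tag
// type. Hypothetical stand-in; the real method also filters by POS-type compatibility.
using System;
using System.Collections.Generic;

static class VoteResolutionSketch
{
    public static Dictionary<Type, object> FirstPerType(IEnumerable<object> rankedVotes)
    {
        var winners = new Dictionary<Type, object>();
        foreach (var vote in rankedVotes)
        {
            if (!winners.ContainsKey(vote.GetType()))
            {
                winners[vote.GetType()] = vote;   // first occurrence is the highest ranked
            }
        }
        return winners;
    }
}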
/// <summary>
/// Builds the table.
/// </summary>
/// <param name="settings">The settings.</param>
/// <param name="type">The type.</param>
/// <returns></returns>
public DataTable BuildTable(cloudMatrixSettings settings, cloudMatrixDataTableType type)
{
    DataTable table = new DataTable();
    table.SetTitle("CloudMatrix_" + name);
    table.SetDescription(description.or("Semantic cloud matrix report"));

    List<lemmaSemanticCloud> clouds = this.Get1stKeys().ToList();

    // make sure every cloud has a class name and a name before it is used as a column
    Int32 ci = 0;
    foreach (lemmaSemanticCloud cl in clouds)
    {
        table.SetAdditionalInfoEntry("Cloud " + ci, cl.className);
        if (cl.className.isNullOrEmpty())
        {
            cl.className = "C" + ci.ToString("D2");
        }
        if (cl.name.isNullOrEmpty())
        {
            cl.name = cl.className;
        }
        ci++;
    }

    instanceCountCollection<String> counter = GetCounter(type.HasFlag(cloudMatrixDataTableType.initialState));

    // normalized values get a fixed-point format; absolute values use the default
    String format = type.HasFlag(cloudMatrixDataTableType.normalizedValues) ? "F5" : "";

    table.Add("Class", "Name of DocumentSetClass attached to the semantic clouds", "", typeof(String), imbSCI.Core.enums.dataPointImportance.normal);
    for (int i = 0; i < clouds.Count; i++)
    {
        table.Add(clouds[i].className, clouds[i].description, "C_" + i.ToString(), typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, format, clouds[i].className);
    }
    table.Add("LemmasInitial", "Number of lemmas in the cloud, before reduction", "", typeof(Int32), imbSCI.Core.enums.dataPointImportance.important, "", "Lemmas - initial");
    table.Add("LinkRateInitial", "Link per node ratio, initial state", "", typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F3", "Link rate initial");
    table.Add("LemmasAfter", "Number of lemmas in the cloud, after reduction", "", typeof(Int32), imbSCI.Core.enums.dataPointImportance.important, "", "Lemmas - after");
    table.Add("LinkRateAfter", "Link per node ratio, after reduction", "", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "F3", "Link rate after");

    for (int y = 0; y < clouds.Count; y++)
    {
        DataRow dr = table.NewRow();
        dr["Class"] = clouds[y].className;
        for (int x = 0; x < clouds.Count; x++)
        {
            if (y == x)
            {
                dr[clouds[x].className] = 0;
            }
            else
            {
                dr[clouds[x].className] = GetCellNumber(clouds[x], clouds[y], type, counter);
            }
        }
        dr["LemmasInitial"] = numberOfLemmas[clouds[y]];
        dr["LemmasAfter"] = clouds[y].CountNodes();
        dr["LinkRateInitial"] = numberOfLinks[clouds[y]].GetRatio(numberOfLemmas[clouds[y]]);
        dr["LinkRateAfter"] = clouds[y].CountLinks().GetRatio(clouds[y].CountNodes());
        table.Rows.Add(dr);
    }

    if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
    {
        DataRow dr = table.NewRow();
        dr["Class"] = "Weight sums";
        for (int y = 0; y < clouds.Count; y++)
        {
            Double sum = 0;
            for (int x = 0; x < clouds.Count; x++)
            {
                sum += this[clouds[x], clouds[y]].Sum(c => c.weight);
            }
            dr[clouds[y].className] = sum;
        }
        dr["LemmasInitial"] = 0;
        dr["LemmasAfter"] = 0;
        dr["LinkRateInitial"] = 0;
        dr["LinkRateAfter"] = 0;
        table.Rows.Add(dr);
    }

    var ty = type.getEnumListFromFlags<cloudMatrixDataTableType>();
    foreach (cloudMatrixDataTableType t in ty)
    {
        table.SetAdditionalInfoEntry(t.toStringSafe(), t.toStringSafe().imbTitleCamelOperation(true));
    }

    if (type.HasFlag(cloudMatrixDataTableType.initialState))
    {
        table.AddExtra("The table shows the state of the matrix before transformation (filtration).");
    }
    else
    {
        table.AddExtra("The table shows the state of the matrix after transformation (filtration).");
    }

    if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
    {
        table.AddExtra("Values in the table show the number of lemmas common to the clouds (of the x and y axes).");
    }
    else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency))
    {
        table.AddExtra("Values in the table show the highest Cloud Frequency for a term (at the x and y axes).");
    }
    else if (type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
    {
        table.AddExtra("Values in the table show the lowest Cloud Frequency for a term (at the x and y axes).");
    }
    else if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
    {
        table.AddExtra("Values in the table show the sum of local weights for overlapping terms. The last row contains the sum of weights for the class cloud.");
    }

    if (type.HasFlag(cloudMatrixDataTableType.normalizedValues))
    {
        if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
        {
            table.AddExtra("The values are normalized to 0-1, where 1 is the overlap size in the initial state for each x,y cell.");
        }
        else
        {
            table.AddExtra("The values are normalized to 0-1.");
        }
    }
    else
    {
        table.AddExtra("The values are absolute.");
    }

    table.SetAdditionalInfoEntry("Max. CF", MaxCloudFrequency);
    table.SetAdditionalInfoEntry("Min. CF", MinCloudFrequency);
    table.SetAdditionalInfoEntry("Max. Overlap", MaxOverlap);
    table.SetAdditionalInfoEntry("Min. Overlap", MinOverlap);

    return table;
}
/// <summary>
/// Transforms the related clouds
/// </summary>
/// <param name="settings">The settings.</param>
/// <param name="logger">The logger.</param>
/// <param name="reductionReportName">Name of the reduction report.</param>
/// <returns>
/// Notes about reduced weights
/// </returns>
public cloudMatrixReductionReport TransformClouds(cloudMatrixSettings settings, ILogBuilder logger, String reductionReportName = "")
{
    cloudMatrixReductionReport reductions = new cloudMatrixReductionReport();
    reductions.name = reductionReportName;

    instanceCountCollection<String> counter = GetCounter(false);

    List<String> passNames = new List<string>();
    List<String> removeNames = new List<string>();
    List<String> removeByLPFNames = new List<string>();
    List<String> setMiniNames = new List<string>();

    MinCloudFrequency = counter.minFreq;
    MaxCloudFrequency = counter.maxFreq;

    Double lowPass = settings.lowPassFilter;

    if (!settings.isActive)
    {
        logger.log("Cloud matrix disabled");
        return reductions;
    }

    if (settings.isFilterInAdaptiveMode)
    {
        // re-base the configured cut-off onto the observed frequency range and clamp it
        lowPass = (MinCloudFrequency - 1) + lowPass;
        if (lowPass > MaxCloudFrequency)
        {
            lowPass = MaxCloudFrequency;
        }
        if (lowPass < 1)
        {
            lowPass = 1;
        }
        logger.log(": Cloud matrix filter in adaptive mode - cut-off frequency set: " + lowPass);
    }

    var sorted = counter.getSorted();

    // <------------------------------------------------------------------------------------------ LOW PASS FILTER LIST
    List<String> doNotReduceWeight = new List<string>();

    foreach (String n in sorted) // <--------- performing the cut-off filter
    {
        if (settings.doCutOffByCloudFrequency)
        {
            Boolean passOk = true;
            if (counter[n] > lowPass)
            {
                passOk = false;
            }
            if (passOk)
            {
                passNames.AddUnique(n);
            }
            else
            {
                if (settings.doAssignMicroWeightInsteadOfRemoval)
                {
                    setMiniNames.AddUnique(n);
                }
                else
                {
                    removeByLPFNames.AddUnique(n);
                }
            }
        }
        else
        {
            passNames.Add(n);
        }
    }
    // <------------------------------------------------------------------------------------------ LOW PASS FILTER LIST

    foreach (lemmaSemanticCloud y in this.Get1stKeys())
    {
        y.RebuildIndex();
        y.description = y.description + " filtered version of cloud";
        reductions.Nodes += y.CountNodes();
        reductions.InitialWeight += y.nodes.Sum(x => x.weight);
    }

    foreach (lemmaSemanticCloud cloud in this.Get1stKeys())
    {
        // <--- apply LPF: demote filtered terms to the micro-weight noise gate
        foreach (String setMini in setMiniNames)
        {
            var node = cloud.GetNode(setMini, true);
            if (node != null)
            {
                reductions.Add(cloud.name, node.name, node.weight, settings.microWeightNoiseGate, cloudMatrixReductionAction.LowPassFilter);
                node.weight = settings.microWeightNoiseGate;
            }
        }

        if (settings.doDivideWeightWithCloudFrequency || settings.doUseSquareFunctionOfCF)
        {
            Int32 rem = 0;
            foreach (String n in passNames)
            {
                var node = cloud.GetNode(n, true);
                if (node != null)
                {
                    Double cf = counter[n];

                    if (settings.doDemoteAnyRepeatingSecondaryTerm)
                    {
                        if (cf > 1 && node.type == 1)
                        {
                            node.type = 0;
                            reductions.Add(cloud.name, node.name, node.weight, node.weight, cloudMatrixReductionAction.Demotion);
                        }
                    }

                    if (settings.doRemoveAnyRepeatingPrimaryTerm)
                    {
                        if (cf > 1 && node.type == 2)
                        {
                            reductions.Add(cloud.name, node.name, node.weight, 0, cloudMatrixReductionAction.Demotion);
                            node.weight = 0;
                        }
                    }
                    else if (settings.doDemoteAnyRepeatingPrimaryTerm)
                    {
                        if (cf > 1 && node.type == 2)
                        {
                            reductions.Add(cloud.name, node.name, node.weight, node.weight, cloudMatrixReductionAction.Demotion);
                            node.type = 1;
                        }
                    }

                    if (!doNotReduceWeight.Contains(n))
                    {
                        // weight is divided by CF (or CF squared): terms repeating across clouds lose weight
                        if (node.weight > 0 && cf > 1)
                        {
                            Double nw = node.weight;
                            if (settings.doUseSquareFunctionOfCF)
                            {
                                node.weight = node.weight.GetRatio(cf * cf);
                            }
                            else
                            {
                                node.weight = node.weight.GetRatio(cf);
                            }
                            if (nw > node.weight)
                            {
                                reductions.Add(cloud.name, node.name, nw, node.weight, cloudMatrixReductionAction.CF_function);
                            }
                        }
                        if (node.weight < settings.microWeightNoiseGate)
                        {
                            removeNames.AddUnique(n);
                            rem++;
                        }
                    }
                }
            }
        }
    }

    foreach (lemmaSemanticCloud y in this.Get1stKeys())
    {
        Int32 rem = 0;
        foreach (String n in removeNames)
        {
            var node = y.GetNode(n);
            if (y.Remove(n))
            {
                rem++;
                reductions.Add(y.name, node.name, node.weight, 0, cloudMatrixReductionAction.Microweight);
            }
        }
        foreach (String n in removeByLPFNames)
        {
            var node = y.GetNode(n);
            if (y.Remove(n))
            {
                rem++;
                reductions.Add(y.name, node.name, node.weight, 0, cloudMatrixReductionAction.LPFRemoval);
            }
        }
        if (rem > 0)
        {
            logger.log(y.className + ": Terms removed[" + rem.ToString("D6") + "] left[" + y.CountNodes().ToString("D6") + "]");
        }
    }

    foreach (lemmaSemanticCloud y in this.Get1stKeys())
    {
        y.RebuildIndex();
        y.description = y.description + " filtered version of cloud";
        reductions.ReducedWeight += y.nodes.Sum(x => x.weight);
    }

    logger.log("Clouds transformation done.");
    return reductions;
}
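// Sketch of the adaptive low-pass cut-off computed in TransformClouds: the configured
// filter value is re-based onto the observed cloud-frequency range and clamped to it
// (hypothetical stand-in over plain doubles).
static class AdaptiveLowPassSketch
{
    public static double CutOff(double configured, double minFreq, double maxFreq)
    {
        double lowPass = (minFreq - 1) + configured;  // shift relative to the observed minimum
        if (lowPass > maxFreq) lowPass = maxFreq;     // clamp into the observed range
        if (lowPass < 1) lowPass = 1;
        return lowPass;
    }
}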
/// <summary>
/// Prepares for parallel execution.
/// </summary>
/// <param name="tools">The tools.</param>
/// <param name="_context">The context.</param>
public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
{
    if (caseKnowledgeSet == null)
    {
        caseKnowledgeSet = new webProjectKnowledgeSet();
    }

    if (items.Any())
    {
        experimentContext.notes.log("Mining Context was ready already.");
        return caseKnowledgeSet;
    }

    DateTime startTime = DateTime.Now;
    experimentContext = _context;

    List<webCaseKnowledge> cases = new List<webCaseKnowledge>();

    folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysed industries (categories), web sites and other diagnostic data");

    // <---------------------------------------------------------------------------------------------------------------- [ performing pipeline ]
    experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");

    foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
    {
        var pipelineContext = GetContextForPipeline(tools, classSet);
        sitesByCategory.Add(classSet, new List<pipelineTaskMCSiteSubject>());

        if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
        {
            throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
        }

        var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)];

        // <----- preparing
        foreach (var site in sitesForContext)
        {
            tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag<pipelineTaskSubjectContentToken>());
            sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

            webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);
            caseKnowledgeSet.Add(webCase);
            cases.Add(webCase);
        }

        semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
        kn.name = classSet.name + "_general";
        kn.relatedItemPureName = classSet.name;
        kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
        kn.Deploy(classReportFolder, experimentContext.logger);
        knowledgeByClass.TryAdd(classSet, kn);
    }

    experimentContext.notes.log("Sorting tokens for all sites [in parallel]");

    Parallel.ForEach(tokenBySite.Keys, site =>
    {
        var leafs = site.getAllLeafs();
        foreach (var leaf in leafs)
        {
            pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
            if (token != null)
            {
                tokenBySite[site].Add(token);
            }
        }
    });

    foreach (var c in cases)
    {
        c.tokens = tokenBySite[c.MCSiteSubject];
    }

    experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");

    Boolean useIntegratedApproach = false;

    if (useIntegratedApproach)
    {
        var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);
        Parallel.ForEach(sitesByCategory, pair =>
        {
            knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
        });
    }
    else
    {
        Parallel.ForEach(sitesByCategory, pair =>
        {
            IDocumentSetClass category = pair.Key;
            List<pipelineTaskMCSiteSubject> sites = pair.Value;

            var lt = BuildLemmaTableForClass(tools, category, sites);
            lt.Save();
        });
    }

    experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
    tools.SaveCache();

    if (!useIntegratedApproach)
    {
        experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            BuildChunksForClass(tools, classSet);
        }

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
        }
    }

    if (tools.operation.doCreateDiagnosticMatrixAtStart)
    {
        experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");

        folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, various matrix derivates, combined cloud and other diagnostic things");

        List<lemmaSemanticCloud> clouds = new List<lemmaSemanticCloud>();
        List<lemmaSemanticCloud> filteredClouds = new List<lemmaSemanticCloud>();

        var converter = lemmaSemanticCloud.GetDGMLConverter();

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
            knowledgeByClass[classSet].semanticCloud.className = classSet.name;
            clouds.Add(cloud);

            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }
            else
            {
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
            }

            knowledgeByClass[classSet].semanticCloudFiltered = knowledgeByClass[classSet].semanticCloud.CloneIntoType<lemmaSemanticCloud>(true);
            knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
            filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
        }

        cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
        matrix.build(filteredClouds, experimentContext.logger);

        lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
        mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));

        var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

        var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
        File.WriteAllLines(p, reductions);

        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);
        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);
        matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);

        matrix.ExportTextReports(matrixReport, true, "matrix_cf");
        matrix.ExportTextReports(matrixReport, false, "matrix_cf");

        lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
        mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Version of all-categories diagnostic Semantic Cloud, after Cloud Matrix filter was applied"));

        if (experimentContext.tools.operation.doUseSimpleGraphs)
        {
            mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in Visual Studio"));
        }
        else
        {
            converter = lemmaSemanticCloud.GetDGMLConverter();
            converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in Visual Studio"));
        }

        // <-------- analysis -----------------------------------------------------------------------------------
        DataTableTypeExtended<freeGraphReport> cloudReports = new DataTableTypeExtended<freeGraphReport>();
        foreach (var cl in filteredClouds)
        {
            freeGraphReport fgReport = new freeGraphReport(cl);
            fgReport.Save(matrixReport);
            cloudReports.AddRow(fgReport);
        }
        freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
        unifiedReport.Save(matrixReport);
        cloudReports.AddRow(unifiedReport);

        cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
        // <-------- analysis -----------------------------------------------------------------------------------

        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var cloud = knowledgeByClass[classSet].semanticCloudFiltered;

            // per-class export of the reduced cloud; the original saved every class to the
            // same "unified_initial_cloud" path, which looks like a copy-paste slip
            if (experimentContext.tools.operation.doUseSimpleGraphs)
            {
                cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after Cloud Matrix filter was applied - Open this in Visual Studio"));
            }
            else
            {
                converter = lemmaSemanticCloud.GetDGMLConverter();
                converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "], after Cloud Matrix filter was applied - Open this in Visual Studio"));
            }
        }

        instanceCountCollection<String> tfcounter = new instanceCountCollection<string>();
        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
            wlt.DefaultView.Sort = "termFrequency desc";
            var sorted = wlt.DefaultView.ToTable();

            var tbl = wlt.GetClonedShema<DataTable>(true);
            tbl.CopyRowsFrom(sorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

            var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
            cht.DefaultView.Sort = "termFrequency desc";
            var csorted = cht.DefaultView.ToTable();

            tbl = cht.GetClonedShema<DataTable>(true);
            tbl.CopyRowsFrom(csorted, 0, 100);
            tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

            tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);
            knowledgeByClass[classSet].OnBeforeSave();
        }

        List<String> countSorted = tfcounter.getSorted();
        StringBuilder sb = new StringBuilder();
        foreach (String s in countSorted)
        {
            sb.AppendLine(String.Format("{1} : {0}", s, tfcounter[s]));
        }
        String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
        File.WriteAllText(pt, sb.ToString());
    }

    if (tools.operation.doFullDiagnosticReport)
    {
        experimentContext.notes.log("Generating full diagnostic report on classes...");
        DataTable rep = null;
        foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
        {
            rep = this.GetClassKnowledgeReport(classSet, rep);
        }
        rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);
        rep.AddExtra("Experiment: " + experimentContext.setup.name);
        rep.AddExtra("Info: " + experimentContext.setup.description);
        rep.SetDescription("Structural report for all classes in the experiment");
        rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
    }

    classReportFolder.generateReadmeFiles(appManager.AppInfo);

    experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");

    return caseKnowledgeSet;
}