/// <summary>
        /// Resets the temporary page-title counter used by this instance.
        /// </summary>
        /// <remarks>
        /// A brand new, empty <c>instanceCountCollection</c> is assigned to <c>pageTitleCount</c> and its
        /// <c>compareModeDefault</c> is set to <c>instanceCountCollectionFormulae.keyCount</c>.
        /// </remarks>
        public override void prepare()
        {
            // The field is pointed at a fresh collection rather than being cleared in place.
            pageTitleCount = new instanceCountCollection<string>
            {
                compareModeDefault = instanceCountCollectionFormulae.keyCount
            };
        }
Example #2
0
        /// <summary>
        /// Builds a node-name counter over every cell of this matrix.
        /// </summary>
        /// <remarks>
        /// Every node found in any <c>this[x, y]</c> cell (for all first keys and their second keys) adds one
        /// instance of its <c>name</c> to a raw counter. The returned collection then receives, for each name,
        /// <c>Convert.ToInt32(Math.Sqrt(rawCount))</c> and is recalculated before it is returned.
        /// </remarks>
        /// <param name="ofCurrentState">
        /// Currently ignored: the method body never reads it. The parameter is kept so that existing callers
        /// keep compiling.
        /// </param>
        /// <returns>A recalculated counter keyed by node name.</returns>
        public instanceCountCollection <String> GetCounter(Boolean ofCurrentState = true)
        {
            instanceCountCollection<String> rawCounts = new instanceCountCollection<string>();

            // Walk the whole matrix and tally each node name once per cell it appears in.
            foreach (lemmaSemanticCloud rowCloud in this.Get1stKeys())
            {
                foreach (lemmaSemanticCloud columnCloud in this.Get2ndKeys(rowCloud))
                {
                    foreach (var node in this[rowCloud, columnCloud])
                    {
                        rawCounts.AddInstance(node.name);
                    }
                }
            }

            instanceCountCollection<String> output = new instanceCountCollection<string>();

            // The stored score is the square root of the raw tally, converted to an integer.
            foreach (String name in rawCounts.Keys)
            {
                output.AddInstance(name, Convert.ToInt32(Math.Sqrt(rawCounts[name])));
            }

            output.reCalculate();
            return output;
        }
Example #3
0
        /// <summary>
        /// Counts the tokens of <c>textContent</c>.
        /// </summary>
        /// <returns>
        /// A new counter filled with the result of <c>textContent.getTokens(true, true, true)</c>.
        /// The counter is returned without calling <c>reCalculate()</c>.
        /// </returns>
        public instanceCountCollection <string> GetTextTokenStats()
        {
            var tokenStats = new instanceCountCollection<string>();

            IEnumerable<string> tokens = (IEnumerable<string>)textContent.getTokens(true, true, true);
            tokenStats.AddInstanceRange(tokens);

            return tokenStats;
        }
Example #4
0
        /// <summary>
        /// Computes the entropy of the text token statistics.
        /// </summary>
        /// <returns>
        /// The <c>entropyFreq</c> value of the collection returned by <c>GetTextTokenStats()</c>,
        /// read after <c>reCalculate()</c> has been called on it.
        /// </returns>
        public double CalculateEntropy()
        {
            var tokenStats = GetTextTokenStats();

            // entropyFreq is read only after the statistics were recalculated.
            tokenStats.reCalculate();
            double entropy = tokenStats.entropyFreq;

            return entropy;
        }
 /// <summary>
 /// Rebuilds <c>stats</c> from all stored responses and collects their descriptions.
 /// </summary>
 /// <remarks>
 /// <c>stats</c> is replaced by a new counter; every response under every key adds one instance of its
 /// <c>response</c> value. Non-empty response descriptions are appended to <c>description</c>
 /// (which is not cleared by this method).
 /// </remarks>
 public void process()
 {
     stats = new instanceCountCollection <tokenQueryResultEnum>();
     foreach (tokenQuerySourceEnum key in Keys)
     {
         foreach (tokenQueryResponse res in this[key])
         {
             stats.AddInstance(res.response, 1);

             // Keep only descriptions that actually carry text. The original condition was
             // inverted and collected nothing but null/empty entries.
             if (!res.description.isNullOrEmpty())
             {
                 description.Add(res.description);
             }
         }
     }
 }
        /// <summary>
        /// Builds per-document token statistics for each of the given document sets.
        /// </summary>
        /// <remarks>
        /// For every set a new inner dictionary is registered in <c>stats</c> under the set name; for every
        /// document of the set, the tokens of <c>document.ToString()</c> are counted, the counter is
        /// recalculated with <c>preCalculateTasks.all</c>, and it is stored under the document name.
        /// </remarks>
        /// <param name="documentSets">The document sets to learn from.</param>
        public override void Learn(IEnumerable <TextDocumentSet> documentSets)
        {
            foreach (TextDocumentSet documentSet in documentSets)
            {
                stats.Add(documentSet.name, new Dictionary<String, instanceCountCollection<String>>());

                foreach (TextDocumentLayerCollection layerCollection in documentSet)
                {
                    List<String> tokens = layerCollection.ToString().getTokens(true, true, true, false, 4);

                    var frequencies = new instanceCountCollection<string>();
                    frequencies.AddInstanceRange(tokens);
                    frequencies.reCalculate(instanceCountCollection<string>.preCalculateTasks.all);

                    stats[documentSet.name].Add(layerCollection.name, frequencies);
                }
            }
        }
Example #7
0
        /// <summary>
        /// Filters the input tokens and returns the accepted ones in the order produced by
        /// <c>getSorted()</c> of their frequency counter.
        /// </summary>
        /// <remarks>
        /// A token is rejected when it is null/empty, equals <see cref="Environment.NewLine"/>, is shorter
        /// than <paramref name="tokenLengthMin"/>, or is contained in either
        /// <paramref name="input_ignoredTokens"/> or <c>globalIgnoreList</c>.
        /// </remarks>
        /// <param name="input_contentTokens">The tokens to examine.</param>
        /// <param name="tokenLengthMin">Minimum accepted token length.</param>
        /// <param name="input_ignoredTokens">Tokens to skip; <c>null</c> is treated as an empty list.</param>
        /// <returns>The accepted tokens, as returned by the counter's <c>getSorted()</c>.</returns>
        public List <string> GetAllProperTokensSortedByFrequency(IEnumerable <string> input_contentTokens, int tokenLengthMin, List <string> input_ignoredTokens)
        {
            var frequencies = new instanceCountCollection<string>();
            List<string> ignored = input_ignoredTokens ?? new List<string>();

            foreach (string candidate in input_contentTokens)
            {
                // Short-circuit evaluation keeps the checks in their original order, so the
                // Length test is never reached for a null/empty token.
                bool rejected = candidate.isNullOrEmptyString()
                                || candidate == Environment.NewLine
                                || candidate.Length < tokenLengthMin
                                || ignored.Contains(candidate)
                                || globalIgnoreList.Contains(candidate);

                if (rejected)
                {
                    continue;
                }

                frequencies.AddInstance(candidate);
            }

            return frequencies.getSorted();
        }
        /// <summary>
        /// Tallies the dominant dimension of every feature vector in this collection and returns
        /// the first entry of the tally's <c>getSorted(1)</c> result.
        /// </summary>
        /// <returns>
        /// The first element of <c>getSorted(1)</c>, or <c>-1</c> when no vector reported a dominant
        /// dimension greater than <c>-1</c>.
        /// </returns>
        public Int32 GetDominantClass()
        {
            var tally = new instanceCountCollection<int>();

            foreach (FeatureVector vector in this)
            {
                Int32 dominant = vector.GetDominantDimension();

                // Negative values mean "no dominant dimension" and are not counted.
                if (dominant < 0)
                {
                    continue;
                }

                tally.AddInstance(dominant);
            }

            return tally.Count == 0 ? -1 : tally.getSorted(1).First();
        }
        /// <summary>
        /// Prepares the factor by building one term-frequency counter per entry of the context.
        /// </summary>
        /// <remarks>
        /// Entries flagged <c>spaceDocument</c> copy token frequencies from <c>spaceDocument.terms</c>.
        /// Otherwise, entries flagged <c>textDocument</c> are tokenized from their text content and,
        /// when <c>useStems</c> is set, each token is stemmed through <c>context.stemmingContext</c>.
        /// Entries with neither flag get an empty counter.
        /// </remarks>
        /// <param name="context">The selection result whose <c>items</c> are processed.</param>
        /// <param name="log">The log. Not used by this method.</param>
        public override void Prepare(DocumentSelectResult context, ILogBuilder log)
        {
            // NOTE(review): only statsByAssignedID is cleared here; assignedIDs keeps whatever it
            // already held - confirm that this is intended.
            statsByAssignedID.Clear();

            foreach (DocumentSelectResultEntry entry in context.items)
            {
                var frequencies = new instanceCountCollection<string>();

                if (entry.type.HasFlag(DocumentSelectEntryType.spaceDocument))
                {
                    SpaceDocumentModel model = entry.spaceDocument;
                    foreach (var term in model.terms.GetTokens())
                    {
                        frequencies.AddInstance(term, model.terms.GetTokenFrequency(term));
                    }
                }
                else if (entry.type.HasFlag(DocumentSelectEntryType.textDocument))
                {
                    String content = entry.textDocument.content;
                    List<String> tokens = content.getTokens(true, true, true, false, 4);

                    foreach (String token in tokens)
                    {
                        String key = useStems ? context.stemmingContext.Stem(token) : token;
                        frequencies.AddInstance(key);
                    }
                }

                statsByAssignedID.Add(entry.AssignedID, frequencies);
                assignedIDs.Add(entry.AssignedID);
            }
        }
Example #10
0
        /// <summary>
        /// Writes one term-frequency text report per first-key cloud of this matrix.
        /// </summary>
        /// <remarks>
        /// Each line has the form <c>value  :   term</c>. In reduced mode only terms with a count above 1
        /// are written and the printed value is the count minus one; otherwise the raw count is printed.
        /// The file name is <c>prefix + className + "_reduced_" | "_initial_" + "_overlap.txt"</c>, resolved
        /// through <c>folder.pathFor</c> in overwrite mode.
        /// </remarks>
        /// <param name="folder">The folder that resolves the output paths.</param>
        /// <param name="reduced">Selects the reduced report format; also forwarded to <c>GetCounter</c>.</param>
        /// <param name="prefix">Text prepended to every file name.</param>
        public void ExportTextReports(folderNode folder, Boolean reduced, String prefix = "")
        {
            // The counter and the report lines do not depend on the cloud being iterated, so they
            // are built once up front instead of once per cloud.
            // NOTE(review): as a consequence (and as before) every cloud's file receives identical
            // content - confirm that a matrix-wide report per cloud is what is wanted.
            instanceCountCollection<string> counter = GetCounter(reduced);

            List<String> lines = new List<string>();
            foreach (string term in counter.getSorted())
            {
                if (reduced)
                {
                    if (counter[term] > 1)
                    {
                        lines.Add(String.Format("{1}  :   {0}", term, counter[term] - 1));
                    }
                }
                else
                {
                    lines.Add(String.Format("{1}  :   {0}", term, counter[term]));
                }
            }

            String stateSuffix = reduced ? "_reduced_" : "_initial_";

            foreach (lemmaSemanticCloud x in this.Get1stKeys())
            {
                String fn = prefix + x.className + stateSuffix + "_overlap.txt";
                fn = folder.pathFor(fn, imbSCI.Data.enums.getWritableFileMode.overwrite, "Cloud Frequency report for all terms in the Cloud Matrix");

                File.WriteAllLines(fn, lines);
            }
        }
 /// <summary>
 /// Resets the temporary title-word counter used by this instance.
 /// </summary>
 /// <remarks>
 /// <c>titleWords</c> is pointed at a new, empty <c>instanceCountCollection</c>.
 /// </remarks>
 public override void prepare()
 {
     // Replace the counter instead of clearing the existing one in place.
     titleWords = new instanceCountCollection<string>();
 }
Example #12
0
        /// <summary>
        /// Alternative cloud construction: ranks lemmas by how often they occur in the selected chunks,
        /// sorts them into primary / secondary / reserve terms and hands the construct to <c>BuildCloud</c>.
        /// </summary>
        /// <remarks>
        /// Only lemmas longer than two characters are considered. A lemma whose count equals the counter's
        /// <c>maxFreq</c> becomes primary, one whose count is above <c>minFreq</c> becomes secondary, and the
        /// rest become reserve terms.
        /// </remarks>
        /// <param name="chunkTable">The chunk table; its list supplies the chunks.</param>
        /// <param name="termTable">The term table, forwarded to <c>BuildCloud</c>.</param>
        /// <param name="output">The cloud to fill; a new one is created when <c>null</c>.</param>
        /// <param name="logger">The logger.</param>
        /// <param name="subjects">The subjects; with more than one subject the chunks are pre-filtered by document set frequency.</param>
        /// <param name="resolver">The resolver, forwarded to <c>BuildCloud</c>.</param>
        /// <returns>The result of <c>BuildCloud</c>.</returns>
        protected lemmaSemanticCloud processAlternative(webLemmaTermTable chunkTable, webLemmaTermTable termTable, lemmaSemanticCloud output, ILogBuilder logger, List <pipelineTaskMCSiteSubject> subjects, ITextResourceResolver resolver)
        {
            if (output == null)
            {
                output = new lemmaSemanticCloud();
            }

            lemmaSemanticConstruct c = new lemmaSemanticConstruct(subjects);

            List<webLemmaTerm> allChunks = chunkTable.GetList();

            // <--------------------------------- DETECTING THE MOST IMPORTANT TERMS
            IEnumerable<webLemmaTerm> vipChunks;
            if (subjects.Count > 1)
            {
                vipChunks = allChunks.Where(x => x.documentSetFrequency > settings.documentSetFreqLowLimit);
            }
            else
            {
                vipChunks = allChunks;
            }

            instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();

            foreach (webLemmaTerm chunk in vipChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();

                lemmaCounter.AddInstanceRange(lemmas);
            }

            c.RelevantTerms = lemmaCounter.getSorted();

            // maxFreq / minFreq are read below, after the counter was recalculated.
            lemmaCounter.reCalculate();

            foreach (String term in c.RelevantTerms)
            {
                if (lemmaCounter[term] == lemmaCounter.maxFreq)
                {
                    c.PrimaryTerms.Add(term);
                }
                else if (lemmaCounter[term] > lemmaCounter.minFreq)
                {
                    c.SecondaryTerms.Add(term);
                }
                else
                {
                    c.ReserveTerms.Add(term);
                }
            }

            c.CollectRelevantTerms(settings.doReserveTermsForClass);
            c.LogConstruct(logger);

            // <--------------------------------- COLLECTING CHUNKS THAT MENTION A RELEVANT TERM

            // The nominal form is split once per chunk instead of once per (chunk, relevant term) pair.
            var docSetFreq = allChunks.Where(x =>
            {
                var chunkLemmas = x.nominalForm.SplitSmart(" ", "", true, true);
                return c.RelevantTerms.Any(y => chunkLemmas.Contains(y));
            });

            foreach (webLemmaTerm chunk in docSetFreq)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();

                // Single-lemma chunks are not registered in the construct.
                if (lemmas.Count > 1)
                {
                    lemmas.Sort((x, y) => String.CompareOrdinal(x, y));
                    c.lemmasList.Add(lemmas);

                    c.weightDict.Add(lemmas, chunk);

                    c.nodeNames.AddRange(lemmas, true);
                }
            }

            return BuildCloud(c, chunkTable, termTable, output, logger, resolver);
        }
Example #13
0
        /// <summary>
        /// Computes the numeric value shown in the matrix cell addressed by <paramref name="x"/> and
        /// <paramref name="y"/>, according to the flags in <paramref name="type"/>.
        /// </summary>
        /// <remarks>
        /// With <c>overlapValue</c> the summed node weight is computed first and returned when it is non-zero.
        /// Otherwise the value is either the overlap size (<c>overlapSize</c>) or the highest / lowest counter
        /// frequency among the cell's nodes (<c>maxCloudFrequency</c> / <c>minCloudFrequency</c>).
        /// <c>initialState</c> selects the stored cell nodes instead of <c>x.GetOverlap(y)</c>, and
        /// <c>normalizedValues</c> passes the result through <c>GetRatio</c>.
        /// </remarks>
        /// <param name="x">The first cloud.</param>
        /// <param name="y">The second cloud.</param>
        /// <param name="type">Flags selecting which value is produced.</param>
        /// <param name="counter">Frequency counter, indexed by node name in the frequency modes.</param>
        /// <returns>The computed cell value; <c>0</c> when no flag combination applies.</returns>
        public Double GetCellNumber(lemmaSemanticCloud x, lemmaSemanticCloud y, cloudMatrixDataTableType type, instanceCountCollection <String> counter)
        {
            List<freeGraphNodeBase> cellNodes = this[x, y];

            // The running extremes start from the opposite matrix-wide bound.
            Double lowest  = MaxCloudFrequency;
            Double highest = MinCloudFrequency;

            Boolean initial        = type.HasFlag(cloudMatrixDataTableType.initialState);
            Boolean normalized     = type.HasFlag(cloudMatrixDataTableType.normalizedValues);
            Boolean bySize         = type.HasFlag(cloudMatrixDataTableType.overlapSize);
            Boolean byMaxFrequency = type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency);
            Boolean byMinFrequency = type.HasFlag(cloudMatrixDataTableType.minCloudFrequency);

            Double result = 0;

            if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
            {
                if (initial)
                {
                    result = cellNodes.Sum(s => s.weight);
                }
                else
                {
                    result = x.GetOverlap(y).Sum(s => s.weight);
                }
            }

            // A non-zero weight sum wins; the remaining modes only fill in a zero result.
            if (result != 0)
            {
                return result;
            }

            if (bySize)
            {
                if (initial)
                {
                    if (normalized)
                    {
                        result = cellNodes.Count.GetRatio(MaxOverlap);
                    }
                    else
                    {
                        result = cellNodes.Count;
                    }
                }
                else if (x == y)
                {
                    result = 0;
                }
                else if (normalized)
                {
                    result = x.GetOverlap(y).Count.GetRatio(cellNodes.Count);
                }
                else
                {
                    result = x.GetOverlap(y).Count;
                }
            }
            else if (byMaxFrequency || byMinFrequency)
            {
                foreach (freeGraphNodeBase node in cellNodes)
                {
                    Double frequency = (Double)counter[node.name];
                    lowest  = Math.Min(lowest, frequency);
                    highest = Math.Max(highest, frequency);
                }

                // The max flag takes precedence when both frequency flags are set.
                if (byMaxFrequency)
                {
                    if (normalized)
                    {
                        result = highest.GetRatio(MaxCloudFrequency);
                    }
                    else
                    {
                        result = highest;
                    }
                }
                else
                {
                    if (normalized)
                    {
                        result = lowest.GetRatio(MinCloudFrequency);
                    }
                    else
                    {
                        result = lowest;
                    }
                }
            }

            return result;
        }
Example #14
0
        /// <summary>
        /// Fills the rank table with primary and secondary lemma terms derived from the chunk table.
        /// </summary>
        /// <remarks>
        /// Only chunks with <c>documentSetFrequency</c> above 1 are used, and only lemmas longer than two
        /// characters. A lemma counted more than once becomes a primary term; every lemma that shared a
        /// chunk with it becomes a secondary term unless it already has a term type. Weights are the
        /// matching <paramref name="termTable"/> weight multiplied by the respective settings factor.
        /// </remarks>
        /// <param name="chunkTable">The chunk table; its list supplies the chunks.</param>
        /// <param name="termTable">The term table used to look up term weights.</param>
        /// <param name="output">The rank table that is updated and returned.</param>
        /// <returns>The same <paramref name="output"/> instance.</returns>
        public industryLemmaRankTable process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, industryLemmaRankTable output)
        {
            List<webLemmaTerm> allChunks = chunkTable.GetList();

            var docSetFreq = allChunks.Where(x => x.documentSetFrequency > 1);

            instanceCountCollection<String> termCounter = new instanceCountCollection<string>();

            // Maps each lemma to the other lemmas it shared a chunk with.
            aceDictionarySet<String, String> dict = new aceDictionarySet<string, string>();

            foreach (webLemmaTerm chunk in docSetFreq)
            {
                var lemmas = chunk.nominalForm.SplitSmart(textMapBase.SEPARATOR, "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();
                termCounter.AddInstanceRange(lemmas);

                foreach (String lm in lemmas)
                {
                    foreach (String lmi in lemmas)
                    {
                        if (lmi != lm)
                        {
                            dict[lm].AddUnique(lmi);
                        }
                    }
                }
            }

            foreach (var term in termCounter)
            {
                if (termCounter[term] > 1)
                {
                    industryLemmaTerm lemma = output.GetOrCreate(term);
                    lemma.termType = industryLemmaTermType.primary;
                    lemma.weight   = settings.PrimaryTermFactor * termTable[lemma.name].weight;

                    lemma.nominalForm = term;
                    output.AddOrUpdate(lemma);

                    if (dict.ContainsKey(lemma.nominalForm))
                    {
                        foreach (String secLemmas in dict[lemma.nominalForm])
                        {
                            industryLemmaTerm lemmaSec = output.GetOrCreate(secLemmas);

                            // Terms that already carry a type are left untouched.
                            if (lemmaSec.termType == industryLemmaTermType.none)
                            {
                                lemmaSec.termType    = industryLemmaTermType.secondary;
                                lemmaSec.weight      = settings.SecondaryTermFactor * termTable[lemmaSec.name].weight;
                                lemmaSec.nominalForm = secLemmas;
                                output.AddOrUpdate(lemmaSec);
                            }
                        }
                    }
                }
            }

            // NOTE: a third pass that assigned "reserve" terms (lemmas sharing any chunk with a
            // primary term, weighted by settings.ReserveTermFactor) was disabled here and has
            // been removed as dead code; see version control history if it needs to be revived.

            return output;
        }
Example #15
0
        /// <summary>
        /// Performs one step of the iterative primary-term optimization and returns the construct to
        /// continue with.
        /// </summary>
        /// <remarks>
        /// For non-case clouds the "on top" chunks are those whose document set frequency exceeds
        /// <c>settings.documentSetFreqLowLimit + lastIteration.createdInIteration</c>; their lemmas are
        /// classified through the resolver: nouns become primary terms, adjectives secondary terms, and
        /// everything else (including unresolved lemmas) goes to the trash bin. Case clouds use all chunks.
        /// Afterwards every chunk is assigned to the primary or secondary chunk list, and its remaining
        /// lemmas are classified as secondary, reserve or trash terms.
        /// </remarks>
        /// <param name="lastIteration">The construct produced by the previous iteration.</param>
        /// <param name="resolver">Resolver used to look up lexic units for lemmas.</param>
        /// <param name="allChunks">All chunks available for the cloud.</param>
        /// <param name="settings">Cloud constructor settings (frequency limits and target counts).</param>
        /// <param name="subjects">The subjects, passed to the new construct.</param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// The new construct, or <paramref name="lastIteration"/> when the new primary-term count is at or
        /// below <c>settings.primaryTermLowTargetCount</c> and not greater than the previous count. In both
        /// of those terminal cases the returned construct has <c>OptimizationDone</c> set.
        /// </returns>
        public static lemmaSemanticConstruct NextIteration(lemmaSemanticConstruct lastIteration, ITextResourceResolver resolver, List <webLemmaTerm> allChunks, cloudConstructorSettings settings, List <pipelineTaskMCSiteSubject> subjects, ILogBuilder logger)
        {
            var c = new lemmaSemanticConstruct(subjects);

            c.createdInIteration = lastIteration.createdInIteration + 1;

            // Provisional bounds; they are what is reported if the method returns early below.
            c.PTCountMin = Math.Min(lastIteration.PTCountMin, lastIteration.PrimaryTerms.Count);
            c.PTCountMax = Math.Max(lastIteration.PTCountMax, lastIteration.PrimaryTerms.Count);

            if (!c.isCaseCloud)
            {
                // The frequency threshold rises with every iteration.
                c.onTopChunks.AddRange(allChunks.Where(x => x.documentSetFrequency > (settings.documentSetFreqLowLimit + lastIteration.createdInIteration)));
            }
            else
            {
                if (!settings.doFactorToCaseClouds)
                {
                    c.OptimizationDone = true;
                }
                c.onTopChunks = allChunks;
            }

            if (!c.isCaseCloud)
            {
                instanceCountCollection<String> lemmaCounter = new instanceCountCollection<string>();

                foreach (webLemmaTerm chunk in c.onTopChunks)
                {
                    var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                    lemmaCounter.AddInstanceRange(lemmas);
                }

                lemmaCounter.reCalculate();

                foreach (String st in lemmaCounter)
                {
                    // Lemmas seen only once are skipped unless no lemma occurs more than once.
                    if (lemmaCounter.maxFreq == 1 || lemmaCounter[st] > 1)
                    {
                        var lu = resolver.GetLexicUnit(st, logger);
                        if (lu == null)
                        {
                            c.TrashBin.AddUnique(st);
                        }
                        else
                        {
                            var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                            if (tg.Contains(pos_type.N))
                            {
                                c.PrimaryTerms.AddUnique(st);
                            }
                            else if (tg.Contains(pos_type.A))
                            {
                                c.SecondaryTerms.AddUnique(st);
                            }
                            else
                            {
                                c.TrashBin.AddUnique(st);
                            }
                        }
                    }
                }

                // <---------------------------- Primary terms extracted

                if (c.PrimaryTerms.Count == 0)
                {
                    if (c.SecondaryTerms.Any())
                    {
                        logger.log(":: Moving Adjective terms [" + c.SecondaryTerms.Count + "] to Primary Terms category, as no Nouns were qualified to the cateogry");
                        c.PrimaryTerms.AddRange(c.SecondaryTerms);
                        c.SecondaryTerms.Clear();
                    }
                }
            }

            // Lemma frequencies over ALL chunks; must be complete before the classification pass.
            instanceCountCollection<String> secondCounter = new instanceCountCollection<string>();

            foreach (webLemmaTerm chunk in allChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);
                secondCounter.AddInstanceRange(lemmas);
            }

            foreach (webLemmaTerm chunk in allChunks)
            {
                var lemmas = chunk.nominalForm.SplitSmart(" ", "", true, true);

                if (lemmas.ContainsAny(c.PrimaryTerms))
                {
                    // Chunk mentions a primary term: register it, then classify its other lemmas.
                    if (c.onTopChunks.Contains(chunk))
                    {
                        c.primaryChunks.Add(chunk);
                    }
                    else
                    {
                        c.secondaryChunks.Add(chunk);
                    }

                    foreach (String lm in lemmas)
                    {
                        if (c.NotProcessed(lm))
                        {
                            var lu = resolver.GetLexicUnit(lm, logger);
                            if (lu == null)
                            {
                                c.TrashBin.AddUnique(lm);
                            }
                            else
                            {
                                var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                                if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                                {
                                    c.SecondaryTerms.AddUnique(lm);
                                }
                                else
                                {
                                    c.TrashBin.AddUnique(lm);
                                }
                            }
                        }
                    }
                }
                else
                {
                    // Chunk without any primary term: frequent nouns/adjectives become reserve terms.
                    foreach (String lm in lemmas)
                    {
                        if (secondCounter[lm] > settings.termInChunkLowerLimit)
                        {
                            if (c.NotProcessed(lm))
                            {
                                var lu = resolver.GetLexicUnit(lm, logger);
                                if (lu == null)
                                {
                                    c.TrashBin.AddUnique(lm);
                                }
                                else
                                {
                                    var tg = lu.GetTagFromGramTags <pos_type>(pos_type.none);
                                    if (tg.ContainsAny(new pos_type[] { pos_type.N, pos_type.A }))
                                    {
                                        c.ReserveTerms.AddUnique(lm);
                                    }
                                    else
                                    {
                                        c.TrashBin.AddUnique(lm);
                                    }
                                }
                            }
                        }
                        else
                        {
                            c.TrashBin.AddUnique(lm);
                        }
                    }
                }
            }

            if (c.OptimizationDone)
            {
                return c;
            }

            c.PTCountMin = Math.Min(lastIteration.PTCountMin, c.PrimaryTerms.Count);
            c.PTCountMax = Math.Max(lastIteration.PTCountMax, c.PrimaryTerms.Count);

            if (c.PrimaryTerms.Count <= settings.primaryTermLowTargetCount)
            {
                if (lastIteration.PrimaryTerms.Count < c.PrimaryTerms.Count)
                {
                    logger.log("[" +
                               c.createdInIteration.ToString("D3") + "] PrimaryTerms count [" + c.PrimaryTerms.Count + "] after [" + c.createdInIteration + "] iterations optimized ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);
                }
                else
                {
                    logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "]  --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "]  T:" + Thread.CurrentThread.Name);

                    logger.log("[" +
                               c.createdInIteration.ToString("D3") + "] previous PrimaryTerms count [" + lastIteration.PrimaryTerms.Count + "] accepted, after [" + c.createdInIteration + "]  ---- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "] T:" + Thread.CurrentThread.Name);

                    // Fall back to the previous construct; it is the one flagged as done below.
                    c = lastIteration;
                }

                c.OptimizationDone = true;
            }
            else
            {
                logger.log("[" + c.createdInIteration.ToString("D3") + "] PrimaryTerms count changed from [" + lastIteration.PrimaryTerms.Count + "] to [" + c.PrimaryTerms.Count + "]  --- Max[" + c.PTCountMax + "] Min[" + c.PTCountMin + "]  T:" + Thread.CurrentThread.Name);
            }

            return c;
        }
Example #16
0
        /// <summary>
        /// Queries <c>termDiscoveryResolver</c> for the input form of <paramref name="item"/>, builds a
        /// <c>gramFlags</c> instance from the sorted votes and adds it to <c>item.gramSet</c>.
        /// </summary>
        /// <param name="item">The term whose <c>inputForm</c> is resolved and whose <c>gramSet</c> receives the result.</param>
        /// <param name="loger">Optional log output; when <c>null</c> nothing is logged.</param>
        /// <param name="debug">Not used by this method.</param>
        /// <returns>
        /// <c>true</c> when the resolver produced no votes and the input form is at least 4 characters long;
        /// <c>false</c> otherwise (including the case when a gram flag set was added to the item).
        /// </returns>
        public bool discoverGram(termExploreItem item, ILogBuilder loger, bool debug = true)
        {
            bool failed = false;

            if (loger != null)
            {
                loger.AppendLine("Item:" + item.inputForm);
            }

            instanceCountCollection <object> res = termDiscoveryResolver.resolveQuery(item.inputForm);

            res.reCalculate();

            if (res.Count > 0)
            {
                List <object> sorted = res.getSorted();

                // When the item already has a POS type, POS votes from the resolver are discarded.
                if (item.gramSet.getPosType() != pos_type.none)
                {
                    sorted.RemoveAll(x => x is pos_type);
                }

                gramFlags gf = new gramFlags();

                if (sorted.Any(x => x is pos_type))
                {
                    gf.Set((pos_type)sorted.First(x => x is pos_type));
                }

                // Keep only the votes whose runtime type is listed for the selected POS type.
                var tl = posConverter.posTypeVsPattern[gf.type];
                sorted.RemoveAll(x => !tl.Contains(x.GetType()));

                if (loger != null)
                {
                    loger.AppendLine("Votes:");

                    // Log at most the first 20 votes; Math.Min keeps the index inside the list bounds.
                    int votesToLog = Math.Min(sorted.Count, 20);
                    for (int i = 0; i < votesToLog; i++)
                    {
                        loger.Append(sorted[i].ToString() + "; ");
                    }
                }

                // For every grammatical category the first vote in sorted order is applied.
                if (sorted.Any(x => x is pos_gender))
                {
                    gf.Set((pos_gender)sorted.First(x => x is pos_gender));
                }
                if (sorted.Any(x => x is pos_gramaticalCase))
                {
                    gf.Set((pos_gramaticalCase)sorted.First(x => x is pos_gramaticalCase));
                }
                if (sorted.Any(x => x is pos_verbform))
                {
                    gf.Set((pos_verbform)sorted.First(x => x is pos_verbform));
                }
                if (sorted.Any(x => x is pos_number))
                {
                    gf.Set((pos_number)sorted.First(x => x is pos_number));
                }
                if (sorted.Any(x => x is pos_degree))
                {
                    gf.Set((pos_degree)sorted.First(x => x is pos_degree));
                }
                if (sorted.Any(x => x is pos_person))
                {
                    gf.Set((pos_person)sorted.First(x => x is pos_person));
                }

                if (loger != null)
                {
                    loger.AppendLine("Final gram:" + gf.ToString());
                }
                item.gramSet.Add(gf);
            }
            else
            {
                // Input forms shorter than 4 characters are not reported as failures.
                if (item.inputForm.Length < 4)
                {
                    return(false);
                }
                failed = true;
            }

            return(failed);
        }
Example #17
0
        /// <summary>
        /// Builds a <see cref="DataTable"/> report of the cloud matrix: one row per cloud, one column per cloud
        /// (cell values come from <c>GetCellNumber</c>, the diagonal is 0), followed by lemma count and link rate
        /// columns for the initial and the current state of each cloud.
        /// </summary>
        /// <param name="settings">The settings. Not used by this method.</param>
        /// <param name="type">
        /// Flags selecting what is reported: they are passed to <c>GetCellNumber</c>, control the number format
        /// of the cloud columns, add the "Weight sums" row (<c>overlapValue</c>) and select the explanatory notes.
        /// </param>
        /// <returns>The populated table, including additional info entries and extra description lines.</returns>
        public DataTable BuildTable(cloudMatrixSettings settings, cloudMatrixDataTableType type)
        {
            DataTable table = new DataTable();

            table.SetTitle("CloudMatrix_" + name);
            table.SetDescription(description.or("Semantic cloud matrix report"));

            List <lemmaSemanticCloud> clouds = this.Get1stKeys().ToList();

            Int32 ci = 0;

            foreach (lemmaSemanticCloud cl in clouds)
            {
                // Give unnamed clouds a generated class name / name first, so the info entry below
                // (and the column names later on) never end up empty.
                if (cl.className.isNullOrEmpty())
                {
                    cl.className = "C" + ci.ToString("D2");
                }
                if (cl.name.isNullOrEmpty())
                {
                    cl.name = cl.className;
                }
                table.SetAdditionalInfoEntry("Cloud " + ci, cl.className);
                ci++;
            }

            instanceCountCollection <String> counter = GetCounter(type.HasFlag(cloudMatrixDataTableType.initialState));

            // Normalized values are shown with five decimals, absolute values use the default format.
            String format = type.HasFlag(cloudMatrixDataTableType.normalizedValues) ? "F5" : "";

            table.Add("Class", "Name of DocumentSetClass attached to the semantic clouds", "", typeof(String), imbSCI.Core.enums.dataPointImportance.normal);

            for (int i = 0; i < clouds.Count; i++)
            {
                table.Add(clouds[i].className, clouds[i].description, "C_" + i.ToString(), typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, format, clouds[i].className);
            }

            table.Add("LemmasInitial", "Number of lemmas in the cloud, before reduction", "", typeof(Int32), imbSCI.Core.enums.dataPointImportance.important, "", "Lemmas - initial");

            table.Add("LinkRateInitial", "Link per node ratio, initial state", "", typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F3", "Link rate initial");
            table.Add("LemmasAfter", "Number of lemmas in the cloud, after reduction", "", typeof(Int32), imbSCI.Core.enums.dataPointImportance.important, "", "Lemmas - after");

            // The link rate is a ratio, so the column has to be Double (same as LinkRateInitial);
            // an Int32 column would coerce the stored ratio to an integer.
            table.Add("LinkRateAfter", "Link per node ratio, after reduction", "", typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F3", "Link rate after");

            for (int y = 0; y < clouds.Count; y++)
            {
                DataRow dr = table.NewRow();

                dr["Class"] = clouds[y].className;

                for (int x = 0; x < clouds.Count; x++)
                {
                    if (y == x)
                    {
                        dr[clouds[x].className] = 0;
                    }
                    else
                    {
                        dr[clouds[x].className] = GetCellNumber(clouds[x], clouds[y], type, counter);
                    }
                }

                dr["LemmasInitial"] = numberOfLemmas[clouds[y]];
                dr["LemmasAfter"]   = clouds[y].CountNodes();

                dr["LinkRateInitial"] = numberOfLinks[clouds[y]].GetRatio(numberOfLemmas[clouds[y]]);
                dr["LinkRateAfter"]   = clouds[y].CountLinks().GetRatio(clouds[y].CountNodes());

                table.Rows.Add(dr);
            }

            if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
            {
                // Extra summary row: per cloud column, the sum of weights over all matrix cells of that column.
                DataRow dr = table.NewRow();

                dr["Class"] = "Weight sums";

                for (int y = 0; y < clouds.Count; y++)
                {
                    Double sum = 0;
                    for (int x = 0; x < clouds.Count; x++)
                    {
                        sum += this[clouds[x], clouds[y]].Sum(c => c.weight);
                    }
                    dr[clouds[y].className] = sum;
                }

                dr["LemmasInitial"] = 0;
                dr["LemmasAfter"]   = 0;

                dr["LinkRateInitial"] = 0;
                dr["LinkRateAfter"]   = 0;

                table.Rows.Add(dr);
            }

            var ty = type.getEnumListFromFlags <cloudMatrixDataTableType>();

            foreach (cloudMatrixDataTableType t in ty)
            {
                table.SetAdditionalInfoEntry(t.toStringSafe(), t.toStringSafe().imbTitleCamelOperation(true));
            }

            if (type.HasFlag(cloudMatrixDataTableType.initialState))
            {
                table.AddExtra("The table shows the state of the matrix before transformation (filtration).");
            }
            else
            {
                table.AddExtra("The table shows the state of the matrix after transformation (filtration).");
            }

            if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
            {
                table.AddExtra("Values in the table are showing number of lemmas that are common to the clouds (of x and y axis).");
            }
            else if (type.HasFlag(cloudMatrixDataTableType.maxCloudFrequency))
            {
                table.AddExtra("Values in the table are showing highest Cloud Frequency for a term (at x and y axis).");
            }
            else if (type.HasFlag(cloudMatrixDataTableType.minCloudFrequency))
            {
                table.AddExtra("Values in the table are showing lowest Cloud Frequency for a term (at x and y axis).");
            }
            else if (type.HasFlag(cloudMatrixDataTableType.overlapValue))
            {
                table.AddExtra("Values in the table are showing sum of local weights for overlapping terms. The last row contains sum of weights for the class cloud.");
            }

            if (type.HasFlag(cloudMatrixDataTableType.normalizedValues))
            {
                if (type.HasFlag(cloudMatrixDataTableType.overlapSize))
                {
                    table.AddExtra("The values are normalized to 0-1, where 1 is overlap size in initial state for each x,y cell.");
                }
                else
                {
                    table.AddExtra("The values are normalized to 0-1.");
                }
            }
            else
            {
                table.AddExtra("The values are absolute.");
            }

            table.SetAdditionalInfoEntry("Max. CF", MaxCloudFrequency);
            table.SetAdditionalInfoEntry("Min. CF", MinCloudFrequency);
            table.SetAdditionalInfoEntry("Max. Overlap", MaxOverlap);
            table.SetAdditionalInfoEntry("Min. Overlap", MinOverlap);
            return(table);
        }
Example #18
0
        /// <summary>
        /// Transforms (filters) the clouds of this matrix according to the Cloud Frequency (CF) of their terms:
        /// applies the low pass cut-off, optional demotion/removal of repeating terms, CF based weight
        /// reduction and finally removes the terms collected for removal from every cloud.
        /// </summary>
        /// <param name="settings">The settings controlling which transformation steps are applied.</param>
        /// <param name="logger">The logger receiving progress messages.</param>
        /// <param name="reductionReportName">Name of the reduction report.</param>
        /// <returns>
        /// Report with notes about reduced weights; when <c>settings.isActive</c> is <c>false</c> the report
        /// is returned without any cloud being modified.
        /// </returns>
        public cloudMatrixReductionReport TransformClouds(cloudMatrixSettings settings, ILogBuilder logger, String reductionReportName = "")
        {
            cloudMatrixReductionReport reductions = new cloudMatrixReductionReport();

            reductions.name = reductionReportName;

            instanceCountCollection <String> counter = GetCounter(false);
            List <String> passNames        = new List <string>();
            List <String> removeNames      = new List <string>();
            List <String> removeByLPFNames = new List <string>();
            List <String> setMiniNames     = new List <string>();

            // The frequency range is published on the matrix even when the filter is disabled.
            MinCloudFrequency = counter.minFreq;
            MaxCloudFrequency = counter.maxFreq;

            Double lowPass = settings.lowPassFilter;

            if (!settings.isActive)
            {
                logger.log("Cloud matrix disabled");
                return(reductions);
            }
            if (settings.isFilterInAdaptiveMode)
            {
                // Adaptive mode: the cut-off is relative to the lowest observed frequency,
                // clamped to the [1, MaxCloudFrequency] range.
                lowPass = (MinCloudFrequency - 1) + lowPass;
                if (lowPass > MaxCloudFrequency)
                {
                    lowPass = MaxCloudFrequency;
                }
                if (lowPass < 1)
                {
                    lowPass = 1;
                }
                logger.log(": Cloud matrix filter in adaptive mode - cut off frequency set: " + lowPass);
            }

            var sorted = counter.getSorted();

            // <------------------------------------------------------------------------------------------ LOW PASS FILTER LIST
            foreach (String n in sorted)
            {
                if (!settings.doCutOffByCloudFrequency)
                {
                    passNames.Add(n);
                    continue;
                }

                if (counter[n] > lowPass)
                {
                    // Above the cut-off frequency: either reduce to micro weight or remove entirely.
                    if (settings.doAssignMicroWeightInsteadOfRemoval)
                    {
                        setMiniNames.AddUnique(n);
                    }
                    else
                    {
                        removeByLPFNames.AddUnique(n);
                    }
                }
                else
                {
                    passNames.AddUnique(n);
                }
            }
            // <------------------------------------------------------------------------------------------ LOW PASS FILTER LIST

            // Snapshot of the state before the transformation.
            foreach (lemmaSemanticCloud y in this.Get1stKeys())
            {
                y.RebuildIndex();
                y.description = y.description + " filtered version of cloud";

                reductions.Nodes         += y.CountNodes();
                reductions.InitialWeight += y.nodes.Sum(x => x.weight);
            }

            foreach (lemmaSemanticCloud cloud in this.Get1stKeys())
            {
                // <--- apply LPF: terms above the cut-off get the micro weight
                foreach (String setMini in setMiniNames)
                {
                    var node = cloud.GetNode(setMini, true);
                    if (node != null)
                    {
                        reductions.Add(cloud.name, node.name, node.weight, settings.microWeightNoiseGate, cloudMatrixReductionAction.LowPassFilter);
                        node.weight = settings.microWeightNoiseGate;
                    }
                }

                if (!(settings.doDivideWeightWithCloudFrequency || settings.doUseSquareFunctionOfCF))
                {
                    continue;
                }

                foreach (String n in passNames)
                {
                    var node = cloud.GetNode(n, true);
                    if (node == null)
                    {
                        continue;
                    }

                    Double  cf          = counter[n];
                    Boolean isRepeating = cf > 1;

                    if (settings.doDemoteAnyRepeatingSecondaryTerm && isRepeating && node.type == 1)
                    {
                        node.type = 0;
                        reductions.Add(cloud.name, node.name, node.weight, node.weight, cloudMatrixReductionAction.Demotion);
                    }

                    // Removal of repeating primary terms takes precedence over their demotion.
                    if (settings.doRemoveAnyRepeatingPrimaryTerm)
                    {
                        if (isRepeating && node.type == 2)
                        {
                            reductions.Add(cloud.name, node.name, node.weight, 0, cloudMatrixReductionAction.Demotion);

                            node.weight = 0;
                        }
                    }
                    else if (settings.doDemoteAnyRepeatingPrimaryTerm)
                    {
                        if (isRepeating && node.type == 2)
                        {
                            reductions.Add(cloud.name, node.name, node.weight, node.weight, cloudMatrixReductionAction.Demotion);

                            node.type = 1;
                        }
                    }

                    if (node.weight > 0 && isRepeating)
                    {
                        // Divide the weight by CF (or CF squared) and report it only when it really dropped.
                        Double nw = node.weight;
                        if (settings.doUseSquareFunctionOfCF)
                        {
                            node.weight = node.weight.GetRatio(cf * cf);
                        }
                        else
                        {
                            node.weight = node.weight.GetRatio(cf);
                        }
                        if (nw > node.weight)
                        {
                            reductions.Add(cloud.name, node.name, nw, node.weight, cloudMatrixReductionAction.CF_function);
                        }
                    }

                    // Terms whose weight fell below the noise gate are scheduled for removal (from every cloud).
                    if (node.weight < settings.microWeightNoiseGate)
                    {
                        removeNames.AddUnique(n);
                    }
                }
            }

            foreach (lemmaSemanticCloud y in this.Get1stKeys())
            {
                Int32 rem = 0;
                foreach (String n in removeNames)
                {
                    // The node is fetched before removal so its name and weight can still be reported.
                    var node = y.GetNode(n);
                    if (y.Remove(n))
                    {
                        rem++;
                        reductions.Add(y.name, node.name, node.weight, 0, cloudMatrixReductionAction.Microweight);
                    }
                }

                foreach (String n in removeByLPFNames)
                {
                    var node = y.GetNode(n);
                    if (y.Remove(n))
                    {
                        rem++;
                        reductions.Add(y.name, node.name, node.weight, 0, cloudMatrixReductionAction.LPFRemoval);
                    }
                }

                if (rem > 0)
                {
                    logger.log(y.className + ": Terms removed[" + rem.ToString("D6") + "] left[" + y.CountNodes().ToString("D6") + "]");
                }
            }

            // Snapshot of the state after the transformation. The description suffix was already
            // appended in the first pass, so it is not appended a second time here.
            foreach (lemmaSemanticCloud y in this.Get1stKeys())
            {
                y.RebuildIndex();

                reductions.ReducedWeight += y.nodes.Sum(x => x.weight);
            }

            logger.log("Clouds transformation done.");

            return(reductions);
        }
Example #19
0
        /// <summary>
        /// Prepares for parallel execution.
        /// </summary>
        /// <param name="tools">The tools.</param>
        /// <param name="_context">The context.</param>
        public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
        {
            if (caseKnowledgeSet == null)
            {
                caseKnowledgeSet = new webProjectKnowledgeSet();
            }

            if (items.Any())
            {
                experimentContext.notes.log("Mining Context was ready already.");
                return(caseKnowledgeSet);
            }
            DateTime startTime = DateTime.Now;

            experimentContext = _context;



            List <webCaseKnowledge> cases = new List <webCaseKnowledge>();

            folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysied industries (categories), web sites and other diagnostic data");

            // <----------------------------------------------------------------------------------------------------------------        [ performing pipeline ]
            experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");
            foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
            {
                var pipelineContext = GetContextForPipeline(tools, classSet);
                sitesByCategory.Add(classSet, new List <pipelineTaskMCSiteSubject>());

                if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
                {
                    throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
                }

                var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)]; // <----- preparing
                foreach (var site in sitesForContext)
                {
                    tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag <pipelineTaskSubjectContentToken>());
                    sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

                    webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

                    caseKnowledgeSet.Add(webCase);
                    cases.Add(webCase);
                }

                semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
                kn.name = classSet.name + "_general";
                kn.relatedItemPureName = classSet.name;
                kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
                kn.Deploy(classReportFolder, experimentContext.logger);
                knowledgeByClass.TryAdd(classSet, kn);
            }

            experimentContext.notes.log("Sorting tokens for all sites [in parallel]");
            Parallel.ForEach(tokenBySite.Keys, site =>
            {
                var leafs = site.getAllLeafs();
                foreach (var leaf in leafs)
                {
                    pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
                    if (token != null)
                    {
                        tokenBySite[site].Add(token);
                    }
                }
            });

            foreach (var c in cases)
            {
                c.tokens = tokenBySite[c.MCSiteSubject];
            }


            experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");


            Boolean useIntegratedApproach = false;



            if (useIntegratedApproach)
            {
                var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
                });
            }
            else
            {
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    IDocumentSetClass category             = pair.Key;
                    List <pipelineTaskMCSiteSubject> sites = pair.Value;

                    var lt = BuildLemmaTableForClass(tools, category, sites);
                    lt.Save();
                    // lt.SaveAs(classReportFolder.pathFor(lt.info.Name), imbSCI.Data.enums.getWritableFileMode.overwrite);
                });
            }

            experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
            tools.SaveCache();


            if (!useIntegratedApproach)
            {
                experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    BuildChunksForClass(tools, classSet);
                }



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
                }
            }

            if (tools.operation.doCreateDiagnosticMatrixAtStart)
            {
                experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");



                folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, varous matrix derivates, combined cloud and other diagnostic things");

                List <lemmaSemanticCloud> clouds         = new List <lemmaSemanticCloud>();
                List <lemmaSemanticCloud> filteredClouds = new List <lemmaSemanticCloud>();

                var converter = lemmaSemanticCloud.GetDGMLConverter();

                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    // experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);


                    var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
                    knowledgeByClass[classSet].semanticCloud.className = classSet.name;
                    clouds.Add(cloud);

                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }
                    else
                    {
                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }



                    knowledgeByClass[classSet].semanticCloudFiltered           = knowledgeByClass[classSet].semanticCloud.CloneIntoType <lemmaSemanticCloud>(true);
                    knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
                    filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
                }

                cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
                matrix.build(filteredClouds, experimentContext.logger);

                lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
                mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));


                var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

                var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
                File.WriteAllLines(p, reductions);



                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);


                matrix.ExportTextReports(matrixReport, true, "matrix_cf");
                matrix.ExportTextReports(matrixReport, false, "matrix_cf");

                lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
                mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object -Version of all-categories diagnostic Semantic Cloud, after Cloud Matrix filter was applied"));

                if (experimentContext.tools.operation.doUseSimpleGraphs)
                {
                    mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                }
                else
                {
                    converter = lemmaSemanticCloud.GetDGMLConverter();

                    converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                }


                // <-------- analysis -----------------------------------------------------------------------------------
                DataTableTypeExtended <freeGraphReport> cloudReports = new DataTableTypeExtended <freeGraphReport>();
                foreach (var cl in filteredClouds)
                {
                    freeGraphReport fgReport = new freeGraphReport(cl);
                    fgReport.Save(matrixReport);
                    cloudReports.AddRow(fgReport);
                }
                freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
                unifiedReport.Save(matrixReport);
                cloudReports.AddRow(unifiedReport);


                cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
                // <-------- analysis -----------------------------------------------------------------------------------



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var cloud = knowledgeByClass[classSet].semanticCloudFiltered; // .WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList());


                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                    }
                    else
                    {
                        converter = lemmaSemanticCloud.GetDGMLConverter();

                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                    }



                    //converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "DirectedGraphML file - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories (Open this with VS)"), imbSCI.Data.enums.getWritableFileMode.overwrite);
                }

                instanceCountCollection <String> tfcounter = new instanceCountCollection <string>();
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
                    wlt.DefaultView.Sort = "termFrequency desc";
                    var sorted = wlt.DefaultView.ToTable();
                    var tbl    = wlt.GetClonedShema <DataTable>(true);

                    tbl.CopyRowsFrom(sorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

                    var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
                    cht.DefaultView.Sort = "termFrequency desc";
                    var csorted = cht.DefaultView.ToTable();

                    tbl = cht.GetClonedShema <DataTable>(true);
                    tbl.CopyRowsFrom(csorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

                    tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);


                    knowledgeByClass[classSet].OnBeforeSave();
                }

                List <String> countSorted = tfcounter.getSorted();
                StringBuilder sb          = new StringBuilder();
                foreach (String s in countSorted)
                {
                    sb.AppendLine(String.Format("{1}  :  {0}", s, tfcounter[s]));
                }
                String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
                File.WriteAllText(pt, sb.ToString());
            }


            if (tools.operation.doFullDiagnosticReport)
            {
                experimentContext.notes.log("Generating full diagnostic report on classes...");
                DataTable rep = null;
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    rep = this.GetClassKnowledgeReport(classSet, rep);
                }
                rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);

                rep.AddExtra("Experiment: " + experimentContext.setup.name);

                rep.AddExtra("Info: " + experimentContext.setup.description);

                rep.SetDescription("Structural report for all classes in the experiment");
                rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
            }

            classReportFolder.generateReadmeFiles(appManager.AppInfo);


            experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");
            return(caseKnowledgeSet);
        }