Example No. 1
        protected void BuildChunksForClass(classifierTools tools, IDocumentSetClass documentSetClass)
        {
            var context = items[documentSetClass.name];

            experimentContext.chunkComposer.reset();
            experimentContext.notes.log("Chunk construction... [" + documentSetClass.name + "]");

            // token streams produced by the pipeline for this class
            ConcurrentBag <IPipelineTaskSubject> MCStreams = context.exitByLevel[cnt_level.mcTokenStream];

            streamsByCategory.Add(documentSetClass, MCStreams.ToList());

            // compose chunks from the token streams
            List <pipelineTaskSubjectContentToken> Chunks = experimentContext.chunkComposer.process(MCStreams.ToSubjectToken(), experimentContext.logger);

            chunksByCategory.Add(documentSetClass, Chunks);

            if (Chunks.Count == 0)
            {
                experimentContext.logger.log("-- no chunks produced for [" + documentSetClass.name + "] -- Stream input count [" + MCStreams.Count + "]");
            }
            else
            {
                experimentContext.notes.log("[" + Chunks.Count + "] chunks constructed for class [" + documentSetClass.name + "]");
            }
        }
Example No. 2
        public T GetKnowledgeInstance <T>(IDocumentSetClass setClass, kFoldValidationCase validationCase, ILogBuilder logger) where T : class, IWebFVExtractorKnowledge, new()
        {
            T knowledge = GetKnowledgeInstance <T>("class_" + setClass.name, validationCase, WebFVExtractorKnowledgeType.aboutCategory, logger);

            knowledge.relatedItemPureName = setClass.name;
            return(knowledge);
        }
Example No. 3
        public void SaveAndReset(IDocumentSetClass setClass)
        {
            // In single-fold runs that use a local cache, persist the used lemma cache once
            if (IsSingleFold && !isUsingGeneralCache && !localLemmaResourcePath.isNullOrEmpty() && !File.Exists(localLemmaResourcePath))
            {
                _lemmaResource.SaveUsedCache(localLemmaResourcePath, true);
            }

            // Without the general cache, drop the resource so it is rebuilt for the next class
            if (!isUsingGeneralCache && !_lemmaResource.localCache.isNullOrEmpty())
            {
                _lemmaResource = null;
            }
        }
Example No. 4
 public void Add(IDocumentSetClass setClass)
 {
     classNames.Add(setClass.name);
     setClass.classID = items.Count;
     setClass.parent  = this;
     items.Add(setClass);
 }
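Example No. 4 wires each class into its collection: the classID is the insertion index and parent points back to the owning collection. A self-contained sketch of the same registration pattern, using hypothetical stand-in types (the real IDocumentSetClass and its container live in the imb* libraries):

    using System.Collections.Generic;

    // Hypothetical stand-ins for IDocumentSetClass and its collection, for illustration only.
    public class DocSetClass
    {
        public string name;
        public int classID;
        public object parent;
    }

    public class DocSetClassCollection
    {
        private readonly List<string> classNames = new List<string>();
        private readonly List<DocSetClass> items = new List<DocSetClass>();

        public void Add(DocSetClass setClass)
        {
            classNames.Add(setClass.name);
            setClass.classID = items.Count; // sequential ID == insertion index
            setClass.parent = this;         // back-reference to the owning collection
            items.Add(setClass);
        }
    }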
Example No. 5
        public IDocumentSetClass GetClass(String className, ILogBuilder logger)
        {
            IDocumentSetClass output = null;

            foreach (IDocumentSetClass cl in items)
            {
                if (cl.name == className)
                {
                    logger.log("Class found by name");
                    output = cl;
                    break;
                }
                if (cl.treeLetterAcronim == className)
                {
                    logger.log("Class found by TLA");
                    output = cl;
                    break;
                }
            }

            if (output == null)
            {
                logger.log("No class found under [" + className + "] name nor tree letter accronim");
            }

            return(output);
        }
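The loop performs a two-key lookup: a class matches either by its name or by its tree-letter acronym. A more compact LINQ equivalent (a sketch, assuming items implements IEnumerable<IDocumentSetClass>; it drops the per-match log messages):

    using System.Linq;

    // First class whose name or tree-letter acronym equals className, or null.
    IDocumentSetClass output = items.FirstOrDefault(cl =>
        cl.name == className || cl.treeLetterAcronim == className);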
Example No. 6
        protected webLemmaTermTable BuildLemmaTableForClass(classifierTools tools, IDocumentSetClass documentSetClass, List <pipelineTaskMCSiteSubject> sites)
        {
            var context = items[documentSetClass.name];

            experimentContext.notes.log("Master TF-IDF table construction (used for POS flagging)... [" + documentSetClass.name + "]");
            webLemmaTermTable lemmaTable = knowledgeByClass[documentSetClass].WLTableOfIndustryClass;

            lemmaTable.Clear();
            experimentContext.masterConstructor.process(GetTokensForSites <IPipelineTaskSubject>(sites), cnt_level.mcPage, lemmaTable, tools.GetLemmaResource(), context.logger, false);

            return(lemmaTable);
        }
Example No. 7
        public void SetLemmaResource(IDocumentSetClass setClass)
        {
            String _localLemmaResourceName = setClass.name + "_general" + ".mtx";

            if (isUsingGeneralCache)
            {
                localLemmaResourcePath = cacheFolder.pathFor(cacheLemmaResourcePath, getWritableFileMode.newOrExisting, "Extracted lexic resource with entries used in this experiment/fold");
            }
            else
            {
                localLemmaResourcePath = cacheFolder.pathFor(_localLemmaResourceName, getWritableFileMode.newOrExisting, "Extracted lexic resource with entries used in this experiment/fold");
            }

            lemmaResource = GetLemmaResource(localLemmaResourcePath) as multitextIndexedResolver;
        }
Example No. 8
        public void OnLoad <T>(folderNode folder, ILogBuilder output) where T : IDocumentSetClass, new()
        {
            folderRoot = folder;

            String pt = folder.pathFor("DocumentSetClasses.txt", imbSCI.Data.enums.getWritableFileMode.newOrExisting, DESC_ListOfIndustries);

            if (File.Exists(pt))
            {
                classNames = File.ReadAllLines(pt).ToList();
            }

            foreach (String className in classNames)
            {
                IDocumentSetClass setClass = className.LoadDataStructure <T>(folderRoot, output);
                setClass.classID = items.Count;
                setClass.parent  = this;
                items.Add(setClass);
            }
        }
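OnLoad rebuilds the collection from a plain-text list of class names, one per line, then loads each class's data structure from the folder. The file round-trip in isolation, as a minimal sketch (File.ReadAllLines is the .NET call used above; SaveNames is a hypothetical counterpart):

    using System.Collections.Generic;
    using System.IO;
    using System.Linq;

    static class ClassNameList
    {
        // Hypothetical save counterpart: one class name per line.
        public static void SaveNames(string path, IEnumerable<string> names)
            => File.WriteAllLines(path, names);

        // Mirrors OnLoad: an absent file simply yields an empty list.
        public static List<string> LoadNames(string path)
            => File.Exists(path) ? File.ReadAllLines(path).ToList() : new List<string>();
    }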
Example No. 9
        protected pipelineModelExecutionContext GetContextForPipeline(classifierTools tools, IDocumentSetClass documentSetClass)
        {
            if (!items.ContainsKey(documentSetClass.name))
            {
                pipelineModelExecutionContext context = machine.run(tools.model, documentSetClass.MCRepositoryName, documentSetClass, new List <String>());

                items.Add(documentSetClass.name, context);
            }

            return(items[documentSetClass.name]);
        }
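GetContextForPipeline memoizes one pipeline run per class name: the expensive machine.run executes only on the first request, and later calls return the cached context. The same get-or-create pattern in a self-contained form (single-threaded; the build delegate stands in for machine.run):

    using System;
    using System.Collections.Generic;

    class ContextCache<TKey, TValue>
    {
        private readonly Dictionary<TKey, TValue> items = new Dictionary<TKey, TValue>();
        private readonly Func<TKey, TValue> build;

        public ContextCache(Func<TKey, TValue> build) { this.build = build; }

        // Runs the expensive build at most once per key.
        public TValue GetOrCreate(TKey key)
        {
            if (!items.TryGetValue(key, out TValue value))
            {
                value = build(key);
                items.Add(key, value);
            }
            return value;
        }
    }

If the same cache were filled from the parallel phases shown in Example No. 17, ConcurrentDictionary.GetOrAdd would be the thread-safe equivalent.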
Example No. 10
 public override tfidfFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
 {
     throw new NotImplementedException();
 }
Example No. 11
 public DocumentSetCaseCollection(IDocumentSetClass _setClass)
 {
     setClass     = _setClass;
     rightClassID = setClass.classID;
 }
Example No. 12
 public void deployClass(IDocumentSetClass site)
 {
     name = site.name.ToUpper();
 }
Example No. 13
 /// <summary>
 /// Gets the context.
 /// </summary>
 /// <param name="tools">The tools.</param>
 /// <param name="documentSetClass">The document set class.</param>
 /// <returns>The pipeline model execution context for the given document set class.</returns>
 public pipelineModelExecutionContext GetContext(classifierTools tools, IDocumentSetClass documentSetClass)
 {
     return(GetContextForPipeline(tools, documentSetClass));
 }
Example No. 14
        public override semanticFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
        {
            semanticFVExtractorKnowledge knowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance <semanticFVExtractorKnowledge>(documentSetClass, vCaseColl.kFoldCase, logger);

            knowledge.SetRebuild(!tools.DoUseExistingKnowledge);


            if (knowledge.ShouldBuildAny())
            {
                DocumentSetCaseCollection dSetCol = new DocumentSetCaseCollection(documentSetClass);


                var context = tools.context.pipelineCollection.GetContext(tools, documentSetClass);

                var sites = context.exitByType[typeof(pipelineTaskMCSiteSubject)].ToList();
                List <pipelineTaskMCSiteSubject> ISites = sites.ConvertList <IPipelineTaskSubject, pipelineTaskMCSiteSubject>().ToList();

                List <pipelineTaskMCSiteSubject> fSites = vCaseColl.FilterSites(ISites);


                dSetCol.deploy(vCaseColl, validationCase, fSites, classes);

                List <webLemmaTermTable> tables = new List <webLemmaTermTable>();

                foreach (DocumentSetCase vc in dSetCol)
                {
                    semanticFVExtractorKnowledge cKnowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance <semanticFVExtractorKnowledge>(vc, validationCase, logger);
                    DoMakeKnowledgeForCase(vc, tools, dSetCol, logger);
                    tables.Add(cKnowledge.WLTableOfIndustryClass);
                }

                var tbl = tables.GetMergedLemmaTable(knowledge.name, logger);
                termTableConstructor.recompute(knowledge.WLTableOfIndustryClass, logger, false, tbl.GetList());



                DoMakeKnowledge(fSites, tools, knowledge, logger);
            }


            logger.log("[ALTPROC] Feature Extraction by [" + name + "][" + vCaseColl.kFoldCase.name + "][" + documentSetClass.name + "] done for " + vCaseColl.className);

            return(knowledge);
        }
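Before recomputing weights, Example No. 14 merges the per-case lemma tables into one class-level table (GetMergedLemmaTable). The merge step reduced to plain term-frequency dictionaries, as a sketch (the real webLemmaTermTable carries more per-term data than a bare count):

    using System.Collections.Generic;

    static class TermTableMerge
    {
        // Sums term frequencies across per-case tables into one class-level table.
        public static Dictionary<string, int> Merge(IEnumerable<Dictionary<string, int>> tables)
        {
            var merged = new Dictionary<string, int>();
            foreach (var table in tables)
            {
                foreach (var entry in table)
                {
                    merged.TryGetValue(entry.Key, out int current);
                    merged[entry.Key] = current + entry.Value;
                }
            }
            return merged;
        }
    }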
Example No. 15
 public webCaseKnowledge(pipelineTaskMCSiteSubject _MCSiteSubject, IDocumentSetClass classSet)
 {
     SetSiteSubject(_MCSiteSubject);
     name     = MCSite.domainInfo.domainRootName;
     industry = classSet;
 }
Example No. 16
        public static DataTable GetClassKnowledgeReport(this DocumentSetPipelineCollection pipelineCollection, IDocumentSetClass caseSet, DataTable output = null)
        {
            if (output == null)
            {
                output = new DataTable();
                output.SetTitle(caseSet.name);
                output.Add("Name", "Name of class or web site", "", typeof(String), imbSCI.Core.enums.dataPointImportance.normal, "", "Name").SetGroup("Repository").SetWidth(25).SetUnit("");
                output.Add("Sites", "Number of sites in cateogry - or 1 for single site entry", "|C_ds|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Sites").SetGroup("Repository").SetWidth(10).SetUnit("n");
                output.Add("Pages", "Total number of pages detected in the repository", "|C_d|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Pages Crawled").SetGroup("Repository").SetWidth(15).SetUnit("n");
                output.Add("PagesValid", "Number of pages used for the category or site", "|C_dv|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Pages Used").SetGroup("Pipeline").SetWidth(15).SetUnit("n");
                output.Add("Blocks", "Number of blocks for category or site", "|C_b|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "").SetGroup("Pipeline").SetWidth(10).SetUnit("n");
                output.Add("Streams", "Number of streams for category or site", "|C_ts|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "").SetGroup("Pipeline").SetWidth(10).SetUnit("n");
                output.Add("Tokens", "Number of tokens for category or site", "|C_t|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "").SetGroup("Pipeline").SetWidth(10).SetUnit("n");
                output.Add("Chunks", "Number of chunks for category - disabled for sites", "|C_c|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "").SetGroup("NLP").SetWidth(10).SetUnit("n");
                output.Add("OnlyLetters", "Number of tokens for category or site with onlyLetter tag", "|C_ttl|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Only Letters").SetGroup("Only Letters").SetWidth(10).SetUnit("n");
                output.Add("OnlyLettersResolved", "Number of tokens resolved by morphosyntactic resource", "|C_ttlr|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Accepted Tokens").SetGroup("Only Letters").SetWidth(10).SetUnit("n");
                output.Add("OnlyLettersUnresolved", "Number of tokens unresolved by morphosyntactic resource", "|C_ttlu|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Dismissed").SetGroup("Only Letters").SetWidth(10).SetUnit("n");
                output.Add("Numbers", "Number of tokens for category or site with a numeric content tag", "|C_ttn|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Number").SetGroup("Other").SetWidth(10).SetUnit("n");
                output.Add("Symbols", "Number of tokens for category or site with a symbolic content tag", "|C_tts|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Symbols").SetGroup("Other").SetWidth(10).SetUnit("n");
                output.Add("Business", "Number of tokens for category or site with any dat_business tag", "|C_ttb|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Business tags").SetGroup("Special").SetWidth(10).SetUnit("n");
                output.Add("Potential", "Number of tokens for category or site with any tkn_potential data point tag", "|C_ttp|", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "Potential tags").SetGroup("Special").SetWidth(10).SetUnit("n");
            }
            else
            {
                output.SetTitle("Class Set Report");
                output.TableName = "multi_class_report";
            }

            output.SetAdditionalInfoEntry("Class " + caseSet.treeLetterAcronim + " name", caseSet.name);
            output.SetAdditionalInfoEntry("Class " + caseSet.treeLetterAcronim + " repo", caseSet.MCRepositoryName);

            var sites = pipelineCollection.sitesByCategory[caseSet].ToList();
            PipelineReportForClass repForClass = new PipelineReportForClass();

            repForClass.deployClass(caseSet);
            repForClass.Chunks = pipelineCollection.chunksByCategory[caseSet].Count();
            foreach (var site in sites)
            {
                var dr = output.NewRow();

                PipelineReportForClass repForSite = new PipelineReportForClass();
                repForSite.deploySite(site);
                repForSite.deployTokens(pipelineCollection.tokenBySite[site as pipelineTaskMCSiteSubject]);


                repForSite.SetDataRow(dr);

                output.Rows.Add(dr);
                repForClass.sum(repForSite);
            }

            var drc = output.NewRow();

            repForClass.SetDataRow(drc);
            output.Rows.Add(drc);
            output.GetRowMetaSet().AddUnit(new dataValueMatchCriterionDynamicStyle <String, DataRowInReportTypeEnum>(new String[] { repForClass.name }, DataRowInReportTypeEnum.dataHighlightA, "Name"));

            return(output);
        }
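GetClassKnowledgeReport follows a create-or-append convention: when output is null it builds the schema and column metadata, otherwise it only appends rows, so one table can accumulate several classes. The same convention with the bare System.Data API (a minimal sketch without the imbSCI column metadata):

    using System.Data;

    static class ReportTable
    {
        // Pass null on the first call to create the schema; pass the returned
        // table on later calls to append more rows.
        public static DataTable AddClassRow(DataTable output, string name, int pages)
        {
            if (output == null)
            {
                output = new DataTable("class_report");
                output.Columns.Add("Name", typeof(string));
                output.Columns.Add("Pages", typeof(int));
            }

            DataRow dr = output.NewRow();
            dr["Name"] = name;
            dr["Pages"] = pages;
            output.Rows.Add(dr);
            return output;
        }
    }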
Example No. 17
        /// <summary>
        /// Prepares for parallel execution.
        /// </summary>
        /// <param name="tools">The tools.</param>
        /// <param name="_context">The context.</param>
        public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
        {
            if (caseKnowledgeSet == null)
            {
                caseKnowledgeSet = new webProjectKnowledgeSet();
            }

            if (items.Any())
            {
                experimentContext.notes.log("Mining Context was ready already.");
                return(caseKnowledgeSet);
            }
            DateTime startTime = DateTime.Now;

            experimentContext = _context;



            List <webCaseKnowledge> cases = new List <webCaseKnowledge>();

            folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysed industries (categories), web sites and other diagnostic data");

            // <----------------------------------------------------------------------------------------------------------------        [ performing pipeline ]
            experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");
            foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
            {
                var pipelineContext = GetContextForPipeline(tools, classSet);
                sitesByCategory.Add(classSet, new List <pipelineTaskMCSiteSubject>());

                if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
                {
                    throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
                }

                var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)]; // <----- preparing
                foreach (var site in sitesForContext)
                {
                    tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag <pipelineTaskSubjectContentToken>());
                    sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

                    webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

                    caseKnowledgeSet.Add(webCase);
                    cases.Add(webCase);
                }

                semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
                kn.name = classSet.name + "_general";
                kn.relatedItemPureName = classSet.name;
                kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
                kn.Deploy(classReportFolder, experimentContext.logger);
                knowledgeByClass.TryAdd(classSet, kn);
            }

            experimentContext.notes.log("Sorting tokens for all sites [in parallel]");
            Parallel.ForEach(tokenBySite.Keys, site =>
            {
                var leafs = site.getAllLeafs();
                foreach (var leaf in leafs)
                {
                    pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
                    if (token != null)
                    {
                        tokenBySite[site].Add(token);
                    }
                }
            });

            foreach (var c in cases)
            {
                c.tokens = tokenBySite[c.MCSiteSubject];
            }


            experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");


            Boolean useIntegratedApproach = false;



            if (useIntegratedApproach)
            {
                var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
                });
            }
            else
            {
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    IDocumentSetClass category             = pair.Key;
                    List <pipelineTaskMCSiteSubject> sites = pair.Value;

                    var lt = BuildLemmaTableForClass(tools, category, sites);
                    lt.Save();
                });
            }

            experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
            tools.SaveCache();


            if (!useIntegratedApproach)
            {
                experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    BuildChunksForClass(tools, classSet);
                }



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
                }
            }

            if (tools.operation.doCreateDiagnosticMatrixAtStart)
            {
                experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");



                folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, various matrix derivatives, the combined cloud and other diagnostic output");

                List <lemmaSemanticCloud> clouds         = new List <lemmaSemanticCloud>();
                List <lemmaSemanticCloud> filteredClouds = new List <lemmaSemanticCloud>();

                var converter = lemmaSemanticCloud.GetDGMLConverter();

                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {

                    var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
                    knowledgeByClass[classSet].semanticCloud.className = classSet.name;
                    clouds.Add(cloud);

                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of the full-sample-set diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }
                    else
                    {
                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of the full-sample-set diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }



                    knowledgeByClass[classSet].semanticCloudFiltered           = knowledgeByClass[classSet].semanticCloud.CloneIntoType <lemmaSemanticCloud>(true);
                    knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
                    filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
                }

                cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
                matrix.build(filteredClouds, experimentContext.logger);

                lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
                mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));


                var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

                var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
                File.WriteAllLines(p, reductions);



                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);


                matrix.ExportTextReports(matrixReport, true, "matrix_cf");
                matrix.ExportTextReports(matrixReport, false, "matrix_cf");

                lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
                mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Version of the all-categories diagnostic Semantic Cloud after the Cloud Matrix filter was applied"));

                if (experimentContext.tools.operation.doUseSimpleGraphs)
                {
                    mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied - open this in Visual Studio"));
                }
                else
                {
                    converter = lemmaSemanticCloud.GetDGMLConverter();

                    converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied - open this in Visual Studio"));
                }


                // <-------- analysis -----------------------------------------------------------------------------------
                DataTableTypeExtended <freeGraphReport> cloudReports = new DataTableTypeExtended <freeGraphReport>();
                foreach (var cl in filteredClouds)
                {
                    freeGraphReport fgReport = new freeGraphReport(cl);
                    fgReport.Save(matrixReport);
                    cloudReports.AddRow(fgReport);
                }
                freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
                unifiedReport.Save(matrixReport);
                cloudReports.AddRow(unifiedReport);


                cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
                // <-------- analysis -----------------------------------------------------------------------------------



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var cloud = knowledgeByClass[classSet].semanticCloudFiltered;

                    // save the per-class cloud after the Cloud Matrix reduction
                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "Semantic Cloud for category [" + classSet.name + "] after the Cloud Matrix filter was applied"));
                    }
                    else
                    {
                        converter = lemmaSemanticCloud.GetDGMLConverter();

                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - Semantic Cloud for category [" + classSet.name + "] after the Cloud Matrix filter was applied - open this in Visual Studio"));
                    }
                }

                instanceCountCollection <String> tfcounter = new instanceCountCollection <string>();
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
                    wlt.DefaultView.Sort = "termFrequency desc";
                    var sorted = wlt.DefaultView.ToTable();
                    var tbl    = wlt.GetClonedShema <DataTable>(true);

                    tbl.CopyRowsFrom(sorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

                    var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
                    cht.DefaultView.Sort = "termFrequency desc";
                    var csorted = cht.DefaultView.ToTable();

                    tbl = cht.GetClonedShema <DataTable>(true);
                    tbl.CopyRowsFrom(csorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

                    tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);


                    knowledgeByClass[classSet].OnBeforeSave();
                }

                List <String> countSorted = tfcounter.getSorted();
                StringBuilder sb          = new StringBuilder();
                foreach (String s in countSorted)
                {
                    sb.AppendLine(String.Format("{1}  :  {0}", s, tfcounter[s]));
                }
                String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
                File.WriteAllText(pt, sb.ToString());
            }


            if (tools.operation.doFullDiagnosticReport)
            {
                experimentContext.notes.log("Generating full diagnostic report on classes...");
                DataTable rep = null;
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    rep = this.GetClassKnowledgeReport(classSet, rep);
                }
                rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);

                rep.AddExtra("Experiment: " + experimentContext.setup.name);

                rep.AddExtra("Info: " + experimentContext.setup.description);

                rep.SetDescription("Structural report for all classes in the experiment");
                rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
            }

            classReportFolder.generateReadmeFiles(appManager.AppInfo);


            experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");
            return(caseKnowledgeSet);
        }
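Both parallel phases above hand each dictionary entry to Parallel.ForEach as a KeyValuePair and collect results through a thread-safe map, as knowledgeByClass.TryAdd does. The shape of that pattern in a self-contained form (the per-pair work here is illustrative):

    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Threading.Tasks;

    class ParallelByCategory
    {
        static void Main()
        {
            var sitesByCategory = new Dictionary<string, List<string>>
            {
                ["retail"] = new List<string> { "a.example", "b.example" },
                ["energy"] = new List<string> { "c.example" }
            };

            // Thread-safe result map, mirroring knowledgeByClass.TryAdd above.
            var siteCountByCategory = new ConcurrentDictionary<string, int>();

            // Each pair (category, its sites) is processed independently.
            Parallel.ForEach(sitesByCategory, pair =>
            {
                siteCountByCategory.TryAdd(pair.Key, pair.Value.Count);
            });

            foreach (var kv in siteCountByCategory)
                Console.WriteLine(kv.Key + " : " + kv.Value);
        }
    }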