Example #1
        protected void BuildChunksForClass(classifierTools tools, IDocumentSetClass documentSetClass)
        {
            var context = items[documentSetClass.name];

            //lemmaTable.SaveAs(experimentContext.folder.pathFor("master_table_" + documentSetClass.name + ".xml", imbSCI.Data.enums.getWritableFileMode.overwrite));
            experimentContext.chunkComposer.reset();
            experimentContext.notes.log("Chunk construction... [" + documentSetClass.name + "]");

            ConcurrentBag <IPipelineTaskSubject> MCStreams = context.exitByLevel[cnt_level.mcTokenStream];  //context.exitSubjects.GetSubjectChildrenTokenType<pipelineTaskSubjectContentToken, IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcTokenStream, cnt_level.mcChunk }, true); // sites.getAllChildren();  //context.exitSubjects.GetSubjectsOfLevel<IPipelineTaskSubject>(cnt_level.mcTokenStream);

            streamsByCategory.Add(documentSetClass, MCStreams.ToList());



            List <pipelineTaskSubjectContentToken> Chunks = experimentContext.chunkComposer.process(MCStreams.ToSubjectToken(), experimentContext.logger);

            chunksByCategory.Add(documentSetClass, Chunks);

            if (Chunks.Count == 0)
            {
                experimentContext.logger.log("-- no chunks produced for [" + documentSetClass.name + "] -- Stream input count [" + MCStreams.Count + "]");
            }
            else
            {
                experimentContext.notes.log("[" + Chunks.Count + "] chunks constructed for class [" + documentSetClass.name + "]");
            }
        }
        /// <summary>
        /// Performs post-processing of the feature vector (FV) knowledge
        /// </summary>
        /// <param name="validationCase">The validation case.</param>
        /// <param name="tools">The tools.</param>
        /// <param name="logger">The logger.</param>
        public override void DoFVPostProcessing(kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
        {
            List <lemmaSemanticCloud> clouds = new List <lemmaSemanticCloud>();

            foreach (var docClass in validationCase.context.classes.GetClasses())
            {
                var knowledge = validationCase.knowledgeLibrary.GetKnowledgeInstance <semanticFVExtractorKnowledge>(docClass, validationCase, logger);
                knowledge.semanticCloudFiltered = knowledge.semanticCloud.Clone();
                clouds.Add(knowledge.semanticCloudFiltered);
                knowledge.semanticCloud.className         = docClass.name;
                knowledge.semanticCloudFiltered.className = docClass.name + "flt";
                if (settings.semanticCloudFilter.isActive)
                {
                    knowledge.semanticCloudFiltered.description = "Semantic cloud filtered with cloud matrix";
                }
                else
                {
                    knowledge.semanticCloudFiltered.description = "Semantic cloud filter is off - this is initial cloud";
                }
            }
            if (settings.semanticCloudFilter.isActive)
            {
                logger.log(validationCase.name + ": Cloud matrix creation starts...");
                cloudMatrix matrix = new cloudMatrix(validationCase.name, "Cloud overlap matrix of [" + clouds.Count + "] for fold [" + validationCase.name + "] of experiment [" + validationCase.context.setup.name + "]");

                matrix.build(clouds, logger);

                matrix.TransformClouds(settings.semanticCloudFilter, logger);


                if (tools.operation.doMakeGraphForClassClouds)
                {
                    foreach (var cloud in clouds)
                    {
                        if (tools.operation.doUseSimpleGraphs)
                        {
                            cloud.GetSimpleGraph(true).Save(validationCase.caseFolder.pathFor("class_" + cloud.className + "_reducedCloud", getWritableFileMode.overwrite));
                        }
                        else
                        {
                            var converter = lemmaSemanticCloud.GetDGMLConverter();
                            converter.ConvertToDMGL(cloud).Save(validationCase.caseFolder.pathFor("class_" + cloud.className + "_reducedCloud", getWritableFileMode.overwrite));
                        }
                    }
                }


                //logger.log(validationCase.name + ": Cloud matrix report creation ...");
                // matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_norm_initial");
                // matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_abs_initial");
                // matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.normalizedValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_norm_reduced");
                // matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_overlap_abs_reduced");
                // matrix.BuildTable(settings.semanticCloudFilter, cloudMatrixDataTableType.stateAfterReduction | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.normalizedValues).GetReportAndSave(validationCase.folder, appManager.AppInfo, "matrix_CF_norm_reduced");
                // logger.log(validationCase.name + ": Cloud matrix report done.");
            }
            else
            {
                logger.log(validationCase.name + ": Cloud matrix is not active");
            }
        }
Example #3
        protected pipelineModelExecutionContext GetContextForPipeline(classifierTools tools, IDocumentSetClass documentSetClass)
        {
            if (!items.ContainsKey(documentSetClass.name))
            {
                pipelineModelExecutionContext context = machine.run(tools.model, documentSetClass.MCRepositoryName, documentSetClass, new List <String>());

                items.Add(documentSetClass.name, context);
            }

            return(items[documentSetClass.name]);
        }
        public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
        {
            var state = states.SetState(trainingSet, GetExperimentSufix());

            _distance = new SquareEuclidean();
            var kNearest = new KNearestNeighbors <Double[]>(k: setup.kNN_k, distance: _distance);

            kNearest.Learn(state.data.inputs, state.data.outputs);
            state.machine = kNearest;

            state.SaveState();
        }
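
A minimal usage sketch of the same Accord.NET API, assuming toy data; the names below (trainedKnn, and the literal inputs/outputs) are hypothetical and only illustrate how the classifier trained in DoTraining would be queried:

        // Hypothetical sketch: train and query an Accord.NET k-NN classifier with
        // SquareEuclidean distance, mirroring the DoTraining code above.
        KNearestNeighbors<Double[]> trainedKnn = new KNearestNeighbors<Double[]>(k: 3, distance: new SquareEuclidean());
        Double[][] inputs  = { new Double[] { 0.0, 1.0 }, new Double[] { 0.1, 0.9 }, new Double[] { 1.0, 0.0 }, new Double[] { 0.9, 0.1 } };
        Int32[]    outputs = { 0, 0, 1, 1 };
        trainedKnn.Learn(inputs, outputs);
        Int32 predictedClass = trainedKnn.Decide(new Double[] { 0.95, 0.05 });   // expected class index: 1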
Example #5
        protected webLemmaTermTable BuildLemmaTableForClass(classifierTools tools, IDocumentSetClass documentSetClass, List <pipelineTaskMCSiteSubject> sites)
        {
            var context = items[documentSetClass.name];

            experimentContext.notes.log("Master TF-IDF table construction (used for POS flagging)... [" + documentSetClass.name + "]");
            webLemmaTermTable lemmaTable = knowledgeByClass[documentSetClass].WLTableOfIndustryClass; // new webLemmaTermTable(experimentContext.folder.pathFor("master_table_" + documentSetClass.name + ".xml"), true, "master_table_" + documentSetClass.name);

            lemmaTable.Clear();
            experimentContext.masterConstructor.process(GetTokensForSites <IPipelineTaskSubject>(sites), cnt_level.mcPage, lemmaTable, tools.GetLemmaResource(), context.logger, false);

            //lemmaTableByClass.TryAdd(documentSetClass, lemmaTable);
            return(lemmaTable);
        }
Example #6
        public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
        {
            var state = states.SetState(trainingSet, GetExperimentSufix());

            if (isMultinominal)
            {
                NaiveBayesLearning <GeneralizedBetaDistribution> teacher = new NaiveBayesLearning <GeneralizedBetaDistribution>();

                // Set options for the component distributions
                teacher.Options.InnerOption = new NormalOptions
                {
                    Regularization = 1e-5 // to avoid zero variances
                };

                // The following line is only needed to ensure reproducible results. Please remove it to enable full parallelization
                teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)
                _teacher = teacher;

                // Learn a machine
                //  state.machine = teacher.Learn(state.data.inputs, state.data.outputs);
            }
            else
            {
                NaiveBayesLearning <NormalDistribution> teacher = new NaiveBayesLearning <NormalDistribution>();

                // Set options for the component distributions
                teacher.Options.InnerOption = new NormalOptions
                {
                    Regularization = 1e-5 // to avoid zero variances
                };

                // The following line is only needed to ensure reproducible results. Please remove it to enable full parallelization
                teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)
                _teacher = teacher;

                // Learn a machine
                state.machine = teacher.Learn(state.data.inputs, state.data.outputs);
            }


            state.SaveState();
        }
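
A short sketch of how the Gaussian branch above is typically completed, assuming Accord.NET's NaiveBayesLearning<NormalDistribution> and toy data; the names nbTeacher/nb and the literal vectors are hypothetical:

        // Hypothetical sketch: learn a Gaussian Naive Bayes model and classify a vector,
        // mirroring the non-multinomial branch of DoTraining above.
        NaiveBayesLearning<NormalDistribution> nbTeacher = new NaiveBayesLearning<NormalDistribution>();
        nbTeacher.Options.InnerOption = new NormalOptions { Regularization = 1e-5 };      // avoid zero variances
        Double[][] inputs  = { new Double[] { 0.0, 1.0 }, new Double[] { 0.1, 0.9 }, new Double[] { 1.0, 0.0 }, new Double[] { 0.9, 0.1 } };
        Int32[]    outputs = { 0, 0, 1, 1 };
        NaiveBayes<NormalDistribution> nb = nbTeacher.Learn(inputs, outputs);
        Int32 answer = nb.Decide(new Double[] { 0.95, 0.05 });                            // expected class index: 1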
Example #7
        /// <summary>
        /// Sets the execution context.
        /// </summary>
        /// <param name="_manager">The manager.</param>
        /// <param name="_setup">The setup.</param>
        /// <param name="_tools">The tools.</param>
        /// <param name="_classes">The classes.</param>
        /// <param name="sufix">The sufix.</param>
        /// <param name="chunker">The chunker.</param>
        /// <param name="_masterExtractor">The master extractor.</param>
        /// <param name="_logger">The logger.</param>
        public void SetExecutionContext(experimentManager _manager, experimentSetup _setup, classifierTools _tools, DocumentSetClasses _classes, String sufix, chunkComposerBasic chunker, semanticFVExtractor _masterExtractor, ILogBuilder _logger = null)
        {
            if (_logger == null)
            {
                _logger = new builderForLog();
                aceLog.consoleControl.setAsOutput(_logger, _setup.name);
            }
            logger        = _logger;
            chunkComposer = chunker;
            setup         = _setup;
            tools         = _tools;
            tools.context = this;
            classes       = _classes;
            // masterConstructor = _masterExtractor.termTableConstructor;



            masterExtractor   = _setup.featureVectorExtractors_semantic.First();
            masterConstructor = masterExtractor.termTableConstructor;
            manager           = _manager;
            String expContextName = "exp_" + setup.name.add(sufix, "_");

            folder           = manager.folder.Add(expContextName, "Experiment " + setup.name, "Directory with all information on the experiment [" + setup.name + "]");
            errorNotesFolder = folder.Add("errors", "Error logs", "Directory with error reports produced if an exception occurs. Normally, if everything went well, this folder should contain only two files: directory_readme.txt and an empty note.txt.");
            errorNotes       = new experimentNotes(errorNotesFolder, "Notes (logs) about critical and non-critical errors that happen during experiment execution. If everything went well, this file should remain empty");

            notes = new experimentNotes(folder, "Notes on experiment setup and execution log");
            aceLog.consoleControl.setAsOutput(notes, "Notes");

            notes.log("Experiment [" + expContextName + "] initiated");
            notes.AppendLine("About: " + setup.description);

            notes.AppendHorizontalLine();



            notes.SaveNote();
            notes.AppendHeading("Feature extraction models");

            var lnsc = chunkComposer.DescribeSelf();

            lnsc.ForEach(x => notes.AppendLine(x));
            notes.AppendLine(" - ");


            List <String> mdn = new List <string>();

            foreach (var md in setup.models)
            {
                if (mdn.Contains(md.name))
                {
                    md.name += "_" + mdn.Count.ToString();
                }
                else
                {
                    mdn.Add(md.name);
                }
            }

            foreach (var md in setup.models)
            {
                String prefix = md.name;
                md.classes = classes;
                md.BuildFeatureVectorDefinition();

                var lns = md.DescribeSelf();
                lns.ForEach(x => notes.AppendLine(x));



                kFoldValidationCollection validationCases = classes.BuildValidationCases(prefix, setup.validationSetup.k, tools.DoDebug, logger, folder, setup.validationSetup.randomize);
                validationCases.pipelineCollection = pipelineCollection;

                validationCases.connectContext(this, md);

                validationCollections.Add(md.name, validationCases);


                //md.postClassifiers = setup.classifiers;
            }
        }
 public abstract void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger);
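
The model-name loop in SetExecutionContext above remembers only the first occurrence of each name, so three or more models sharing a name could still receive the same suffix. A minimal, self-contained sketch of a more defensive renaming rule (hypothetical helper, not part of the original class):

        // Hypothetical helper: suffix repeated names until every entry is unique, and
        // remember the suffixed names as well so later duplicates cannot collide.
        private static List<String> MakeNamesUnique(IEnumerable<String> names)
        {
            HashSet<String> seen   = new HashSet<String>();
            List<String>    result = new List<String>();
            foreach (String name in names)
            {
                String candidate = name;
                Int32  counter   = 1;
                while (seen.Contains(candidate))
                {
                    candidate = name + "_" + counter;   // e.g. "svm", "svm_1", "svm_2"
                    counter++;
                }
                seen.Add(candidate);
                result.Add(candidate);
            }
            return result;
        }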
Example #9
        public override void DoMakeKnowledge(List <pipelineTaskMCSiteSubject> subjects, classifierTools tools, tfidfFVExtractorKnowledge knowledge, ILogBuilder logger)
        {
            knowledge.WLTableOfIndustryClass.Clear();
            knowledge.WLTableOfIndustryClass = constructor.process(subjects, cnt_level.mcPage, knowledge.WLTableOfIndustryClass, tools.GetLemmaResource(), logger, false);

            logger.log("TF-IDF built for [" + knowledge.name + "]");
        }
Example #10
 public override void DoFVPostProcessing(kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
 {
 }
Example #11
 public override tfidfFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
 {
     throw new NotImplementedException();
 }
Example #12
 /// <summary>
 /// Gets the context.
 /// </summary>
 /// <param name="tools">The tools.</param>
 /// <param name="documentSetClass">The document set class.</param>
 /// <returns></returns>
 public pipelineModelExecutionContext GetContext(classifierTools tools, IDocumentSetClass documentSetClass)
 {
     return(GetContextForPipeline(tools, documentSetClass));
 }
Example #13
        /// <summary>
        /// Prepares for parallel execution.
        /// </summary>
        /// <param name="tools">The tools.</param>
        /// <param name="_context">The context.</param>
        public webProjectKnowledgeSet PrepareForParallelExecution(classifierTools tools, experimentExecutionContext _context)
        {
            if (caseKnowledgeSet == null)
            {
                caseKnowledgeSet = new webProjectKnowledgeSet();
            }

            if (items.Any())
            {
                experimentContext.notes.log("Mining Context was ready already.");
                return(caseKnowledgeSet);
            }
            DateTime startTime = DateTime.Now;

            experimentContext = _context;



            List <webCaseKnowledge> cases = new List <webCaseKnowledge>();

            folderNode classReportFolder = experimentContext.folder.Add("General", "General and diagnostic reports", "The folder contains general (outside k-folds) reports on analysed industries (categories), web sites and other diagnostic data");

            // <----------------------------------------------------------------------------------------------------------------        [ performing pipeline ]
            experimentContext.notes.log("Executing the Mining Context decomposition with the pipeline model");
            foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
            {
                var pipelineContext = GetContextForPipeline(tools, classSet);
                sitesByCategory.Add(classSet, new List <pipelineTaskMCSiteSubject>());

                if (!pipelineContext.exitByType.ContainsKey(typeof(pipelineTaskMCSiteSubject)))
                {
                    throw new aceGeneralException("Pipeline context output contains no web site subjects! Check the pipeline Site Task constructor.", null, pipelineContext, "Pipeline broken");
                }

                var sitesForContext = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)]; // <----- preparing
                foreach (var site in sitesForContext)
                {
                    tokenBySite.Add(site as pipelineTaskMCSiteSubject, new ConcurrentBag <pipelineTaskSubjectContentToken>());
                    sitesByCategory[classSet].Add(site as pipelineTaskMCSiteSubject);

                    webCaseKnowledge webCase = new webCaseKnowledge(site as pipelineTaskMCSiteSubject, classSet);

                    caseKnowledgeSet.Add(webCase);
                    cases.Add(webCase);
                }

                semanticFVExtractorKnowledge kn = new semanticFVExtractorKnowledge();
                kn.name = classSet.name + "_general";
                kn.relatedItemPureName = classSet.name;
                kn.type = WebFVExtractorKnowledgeType.aboutCompleteCategory;
                kn.Deploy(classReportFolder, experimentContext.logger);
                knowledgeByClass.TryAdd(classSet, kn);
            }

            experimentContext.notes.log("Sorting tokens for all sites [in parallel]");
            Parallel.ForEach(tokenBySite.Keys, site =>
            {
                var leafs = site.getAllLeafs();
                foreach (var leaf in leafs)
                {
                    pipelineTaskSubjectContentToken token = leaf as pipelineTaskSubjectContentToken;
                    if (token != null)
                    {
                        tokenBySite[site].Add(token);
                    }
                }
            });

            foreach (var c in cases)
            {
                c.tokens = tokenBySite[c.MCSiteSubject];
            }


            experimentContext.notes.log("Building diagnostic TF-IDF master tables for all classes [in parallel]");


            Boolean useIntegratedApproach = false;



            if (useIntegratedApproach)
            {
                var valCase = experimentContext.validationCollections[experimentContext.masterExtractor.name].GetDiagnosticCase(experimentContext.classes);
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    knowledgeByClass.TryAdd(pair.Key, experimentContext.masterExtractor.DoFVExtractionForClassViaCases(valCase.trainingCases[pair.Key.classID], pair.Key, valCase, experimentContext.tools, experimentContext.logger));
                });
            }
            else
            {
                Parallel.ForEach(sitesByCategory, pair =>
                {
                    IDocumentSetClass category             = pair.Key;
                    List <pipelineTaskMCSiteSubject> sites = pair.Value;

                    var lt = BuildLemmaTableForClass(tools, category, sites);
                    lt.Save();
                    // lt.SaveAs(classReportFolder.pathFor(lt.info.Name), imbSCI.Data.enums.getWritableFileMode.overwrite);
                });
            }

            experimentContext.notes.log("Saving lexic resource cache subset - for later reuse in case of repeated experiment run");
            tools.SaveCache();


            if (!useIntegratedApproach)
            {
                experimentContext.notes.log("Performing chunk construction for all web sites in all categories [in serial]");



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    BuildChunksForClass(tools, classSet);
                }



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);
                }
            }

            if (tools.operation.doCreateDiagnosticMatrixAtStart)
            {
                experimentContext.notes.log("Performing diagnostic analysis on all categories...[doCreateDiagnosticMatrixAtStart=true]");



                folderNode matrixReport = classReportFolder.Add("clouds", "More reports on semantic cloud", "Directory contains exported DirectedGraphs, various matrix derivatives, the combined cloud and other diagnostic output");

                List <lemmaSemanticCloud> clouds         = new List <lemmaSemanticCloud>();
                List <lemmaSemanticCloud> filteredClouds = new List <lemmaSemanticCloud>();

                var converter = lemmaSemanticCloud.GetDGMLConverter();

                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    // experimentContext.masterExtractor.chunkTableConstructor.process(chunksByCategory[classSet], cnt_level.mcPage, knowledgeByClass[classSet].WLChunkTableOfIndustryClass, null, experimentContext.logger, false);


                    var cloud = experimentContext.masterExtractor.CloudConstructor.process(knowledgeByClass[classSet].WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList(), tools.GetLemmaResource());
                    knowledgeByClass[classSet].semanticCloud.className = classSet.name;
                    clouds.Add(cloud);

                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of full-sample set, diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }
                    else
                    {
                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_initial_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "Initial version of the full-sample-set diagnostic Semantic Cloud for category [" + classSet.name + "]"));
                    }



                    knowledgeByClass[classSet].semanticCloudFiltered           = knowledgeByClass[classSet].semanticCloud.CloneIntoType <lemmaSemanticCloud>(true);
                    knowledgeByClass[classSet].semanticCloudFiltered.className = classSet.name;
                    filteredClouds.Add(knowledgeByClass[classSet].semanticCloudFiltered);
                }

                cloudMatrix matrix = new cloudMatrix("CloudMatrix", "Diagnostic cloud matrix created from the complete sample set of [" + clouds.Count() + "] classes");
                matrix.build(filteredClouds, experimentContext.logger);

                lemmaSemanticCloud mergedCloudInitial = matrix.GetUnifiedCloud();
                mergedCloudInitial.Save(matrixReport.pathFor("unified_initial_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories"));


                var reductions = matrix.TransformClouds(experimentContext.masterExtractor.settings.semanticCloudFilter, experimentContext.logger);

                var p = matrixReport.pathFor("reductions_nodes.txt", imbSCI.Data.enums.getWritableFileMode.overwrite, "Report on Cloud Matrix transformation process");
                File.WriteAllLines(p, reductions);



                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.maxCloudFrequency | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_max_cf_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapSize | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_size_initial", true, experimentContext.tools.operation.doReportsInParalell);

                matrix.BuildTable(experimentContext.masterExtractor.settings.semanticCloudFilter, cloudMatrixDataTableType.initialState | cloudMatrixDataTableType.overlapValue | cloudMatrixDataTableType.absoluteValues).GetReportAndSave(matrixReport, appManager.AppInfo, "matrix_overlap_value_initial", true, experimentContext.tools.operation.doReportsInParalell);


                matrix.ExportTextReports(matrixReport, true, "matrix_cf");
                matrix.ExportTextReports(matrixReport, false, "matrix_cf");

                lemmaSemanticCloud mergedCloudAfterReduction = matrix.GetUnifiedCloud();
                mergedCloudAfterReduction.Save(matrixReport.pathFor("unified_reduced_cloud.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized object - Version of the all-categories diagnostic Semantic Cloud, after the Cloud Matrix filter was applied"));

                if (experimentContext.tools.operation.doUseSimpleGraphs)
                {
                    mergedCloudInitial.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied - open this in Visual Studio"));
                }
                else
                {
                    converter = lemmaSemanticCloud.GetDGMLConverter();

                    converter.ConvertToDMGL(mergedCloudInitial).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before the Cloud Matrix filter was applied - open this in Visual Studio"));
                }


                // <-------- analysis -----------------------------------------------------------------------------------
                DataTableTypeExtended <freeGraphReport> cloudReports = new DataTableTypeExtended <freeGraphReport>();
                foreach (var cl in filteredClouds)
                {
                    freeGraphReport fgReport = new freeGraphReport(cl);
                    fgReport.Save(matrixReport);
                    cloudReports.AddRow(fgReport);
                }
                freeGraphReport unifiedReport = new freeGraphReport(mergedCloudAfterReduction);
                unifiedReport.Save(matrixReport);
                cloudReports.AddRow(unifiedReport);


                cloudReports.GetReportAndSave(matrixReport, appManager.AppInfo, "analysis_SemanticClouds");
                // <-------- analysis -----------------------------------------------------------------------------------



                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var cloud = knowledgeByClass[classSet].semanticCloudFiltered; // .WLChunkTableOfIndustryClass, knowledgeByClass[classSet].WLTableOfIndustryClass, knowledgeByClass[classSet].semanticCloud, experimentContext.logger, tokenBySite.Keys.ToList());


                    if (experimentContext.tools.operation.doUseSimpleGraphs)
                    {
                        cloud.GetSimpleGraph(true).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                    }
                    else
                    {
                        converter = lemmaSemanticCloud.GetDGMLConverter();

                        converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("unified_initial_cloud", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphML file - unified Semantic Cloud, before Cloud Matrix filter was applied - Open this in VisualStudo)"));
                    }



                    //converter.ConvertToDMGL(cloud).Save(matrixReport.pathFor("cloud_reduced_" + classSet.name, imbSCI.Data.enums.getWritableFileMode.none, "DirectedGraphML file - Initial version of Semantic Cloud built as union of full-sample set Semantic Clouds of all categories (Open this with VS)"), imbSCI.Data.enums.getWritableFileMode.overwrite);
                }

                instanceCountCollection <String> tfcounter = new instanceCountCollection <string>();
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    var wlt = knowledgeByClass[classSet].WLTableOfIndustryClass.GetDataTable();
                    wlt.DefaultView.Sort = "termFrequency desc";
                    var sorted = wlt.DefaultView.ToTable();
                    var tbl    = wlt.GetClonedShema <DataTable>(true);

                    tbl.CopyRowsFrom(sorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_WebLemma", true, experimentContext.tools.operation.doReportsInParalell);

                    var cht = knowledgeByClass[classSet].WLChunkTableOfIndustryClass.GetDataTable();
                    cht.DefaultView.Sort = "termFrequency desc";
                    var csorted = cht.DefaultView.ToTable();

                    tbl = cht.GetClonedShema <DataTable>(true);
                    tbl.CopyRowsFrom(csorted, 0, 100);
                    tbl.GetReportAndSave(classReportFolder, appManager.AppInfo, classSet.name + "_Chunks", true, experimentContext.tools.operation.doReportsInParalell);

                    tfcounter.AddInstanceRange(knowledgeByClass[classSet].WLTableOfIndustryClass.unresolved);


                    knowledgeByClass[classSet].OnBeforeSave();
                }

                List <String> countSorted = tfcounter.getSorted();
                StringBuilder sb          = new StringBuilder();
                foreach (String s in countSorted)
                {
                    sb.AppendLine(String.Format("{1}  :  {0}", s, tfcounter[s]));
                }
                String pt = classReportFolder.pathFor("unresolved_tokens.txt", imbSCI.Data.enums.getWritableFileMode.none, "Cloud Frequency list of all unresolved letter-only tokens");
                File.WriteAllText(pt, sb.ToString());
            }


            if (tools.operation.doFullDiagnosticReport)
            {
                experimentContext.notes.log("Generating full diagnostic report on classes...");
                DataTable rep = null;
                foreach (IDocumentSetClass classSet in experimentContext.classes.GetClasses())
                {
                    rep = this.GetClassKnowledgeReport(classSet, rep);
                }
                rep.SetAdditionalInfoEntry("Experiment", experimentContext.setup.name);

                rep.AddExtra("Experiment: " + experimentContext.setup.name);

                rep.AddExtra("Info: " + experimentContext.setup.description);

                rep.SetDescription("Structural report for all classes in the experiment");
                rep.GetReportAndSave(classReportFolder, appManager.AppInfo, "structural_class_report", true, experimentContext.tools.operation.doReportsInParalell);
            }

            classReportFolder.generateReadmeFiles(appManager.AppInfo);


            experimentContext.notes.log("Mining Context preprocessing done in [" + DateTime.Now.Subtract(startTime).TotalMinutes.ToString("F2") + "] minutes");
            return(caseKnowledgeSet);
        }
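
The token-sorting step in PrepareForParallelExecution writes from Parallel.ForEach into per-site ConcurrentBag instances, which is what makes the concurrent Add calls safe. Reduced to its essentials, with hypothetical stand-in keys and tokens (and assuming System.Collections.Concurrent and System.Threading.Tasks), the pattern is:

        // Generic sketch of the thread-safe collection pattern used for
        // "Sorting tokens for all sites [in parallel]": the dictionary is only read
        // inside the parallel loop, and each iteration adds to its own ConcurrentBag.
        Dictionary<String, ConcurrentBag<String>> tokensBySite = new Dictionary<String, ConcurrentBag<String>>();
        foreach (String site in new[] { "siteA", "siteB" })                       // hypothetical site keys
        {
            tokensBySite.Add(site, new ConcurrentBag<String>());
        }
        Parallel.ForEach(tokensBySite.Keys, site =>
        {
            foreach (String token in new[] { site + "_t1", site + "_t2" })        // stand-in for site.getAllLeafs()
            {
                tokensBySite[site].Add(token);                                    // ConcurrentBag.Add is thread-safe
            }
        });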
Example #14
        public override void DoTraining(DocumentSetCaseCollectionSet trainingSet, classifierTools tools, ILogBuilder logger)
        {
            var state = states.SetState(trainingSet, GetExperimentSufix());


            if (activationFunction == null)
            {
                activationFunction = new BipolarSigmoidFunction(setup.neuralnetwork.alpha);
            }

            var neurons = setup.neuralnetwork.HiddenLayersNeuronCounts.ToList();

            ActivationNetwork machine = null;

            switch (neurons.Count)
            {
            case 0:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, state.data.NumberOfClasses);
                break;

            case 1:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], state.data.NumberOfClasses);
                break;

            case 2:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], state.data.NumberOfClasses);
                break;

            case 3:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], state.data.NumberOfClasses);
                break;

            case 4:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], state.data.NumberOfClasses);
                break;

            case 5:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], state.data.NumberOfClasses);
                break;

            case 6:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], neurons[5], state.data.NumberOfClasses);
                break;

            case 7:
                machine = new ActivationNetwork(new BipolarSigmoidFunction(setup.neuralnetwork.alpha), state.data.NumberOfInputs, neurons[0], neurons[1], neurons[2], neurons[3], neurons[4], neurons[5], neurons[6], state.data.NumberOfClasses);
                break;

            default:
                throw new aceGeneralException("At current implementation NN with [" + neurons.Count + "] hidden layers is not allowed.", null, this, "To high number of hidden layers");
                break;
            }

            new NguyenWidrow(machine).Randomize();
            state.machine = machine;

            // BackPropagationLearning teacher = new BackPropagationLearning(machine);
            LevenbergMarquardtLearning teacher = new LevenbergMarquardtLearning(machine);

            teacher.LearningRate = setup.neuralnetwork.learningRate;

            var outputs = Accord.Statistics.Tools.Expand(state.data.outputs, state.data.NumberOfClasses, -1, 1);
            //teacher.Momentum = momentum;
            Int32  itOfSameError        = 0;
            Int32  itOfSameErrorLimit   = setup.neuralnetwork.learningIterationsMax / 10;
            Double errorSignificantSpan = setup.neuralnetwork.errorLowerLimit * setup.neuralnetwork.errorLowerLimit;

            for (int i = 0; i < setup.neuralnetwork.learningIterationsMax; i++)
            {
                double error = teacher.RunEpoch(state.data.inputs, outputs);

                if (Math.Abs(error - state.errorRate) < errorSignificantSpan)
                {
                    itOfSameError++;
                }

                if (itOfSameError > itOfSameErrorLimit)
                {
                    logger.log("Stoping training in [" + i.ToString("D3") + "] because error rate had no significant change [" + errorSignificantSpan.ToString("F8") + "] in last [" + itOfSameError + "] iterations [" + error.ToString("F8") + "]");
                    break;
                }
                if (i % 10 == 0)
                {
                    logger.log("Learning Neural Network [" + i.ToString("D3") + "]  Error rate: " + error.ToString("F5"));
                }
                if (error < state.errorRate)
                {
                    state.errorRate = error;
                }
                if (error < setup.neuralnetwork.errorLowerLimit)
                {
                    break;
                }
            }
            if (teacherRef == null)
            {
                teacherRef = teacher;
            }
            state.SaveState();
        }
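
The switch over neurons.Count in DoTraining exists only because each hidden-layer size is passed as a separate constructor argument. A compact alternative, sketched here under the assumption that the params int[] overload of the AForge/Accord ActivationNetwork constructor is used (someFeatureVector is a hypothetical input):

            // Hypothetical sketch: build the same network for any number of hidden layers
            // by appending the output layer size and using the params int[] constructor.
            List<Int32> layerSizes = setup.neuralnetwork.HiddenLayersNeuronCounts.ToList();
            layerSizes.Add(state.data.NumberOfClasses);                 // output layer: one neuron per class
            ActivationNetwork network = new ActivationNetwork(
                new BipolarSigmoidFunction(setup.neuralnetwork.alpha),
                state.data.NumberOfInputs,
                layerSizes.ToArray());

            // Hypothetical prediction step: the output is one activation per class,
            // so the predicted class is the index of the maximum activation.
            Double[] output = network.Compute(someFeatureVector);
            Int32 predicted = 0;
            for (Int32 c = 1; c < output.Length; c++)
            {
                if (output[c] > output[predicted]) { predicted = c; }
            }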
        /// <summary>
        /// Builds the knowledge (term table, chunk table and semantic cloud) for the given subjects.
        /// </summary>
        /// <param name="subjects">The subjects.</param>
        /// <param name="tools">The tools.</param>
        /// <param name="knowledge">The knowledge.</param>
        /// <param name="logger">The logger.</param>
        public override void DoMakeKnowledge(List <pipelineTaskMCSiteSubject> subjects, classifierTools tools, semanticFVExtractorKnowledge knowledge, ILogBuilder logger)
        {
            Boolean report = tools.DoReport;

            if (knowledge.doBuildTermTable)
            {
                knowledge.WLTableOfIndustryClass.Clear();
                knowledge.WLTableOfIndustryClass = termTableConstructor.process(tools.context.pipelineCollection.GetTokensForSites <IPipelineTaskSubject>(subjects), cnt_level.mcPage, knowledge.WLTableOfIndustryClass, tools.GetLemmaResource(), logger, subjects.Count == 1);
            }
            else
            {
                if (subjects.Count == 1)
                {
                    // logger.log("Using existing Web Lemma Table on [" + subjects.First().name + "]");
                }
            }

            if (knowledge.doBuildChunkTable)
            {
                if ((subjects.Count > 1) || SVMChunkSimilarity.isActive)
                {
                    if (semanticSimilarity.isActive || SVMChunkSimilarity.isActive || cosineSemanticSimilarity.isActive)
                    {
                        List <IPipelineTaskSubject> MCChunks = subjects.GetSubjectChildrenTokenType <IPipelineTaskSubject, IPipelineTaskSubject>(new cnt_level[] { cnt_level.mcChunk }, true); // sites.getAllChildren();  //context.exitSubjects.GetSubjectsOfLevel<IPipelineTaskSubject>(cnt_level.mcTokenStream);

                        if (!MCChunks.Any())
                        {
                            throw new aceScienceException("No chunks found from [" + subjects.Count + "] web sites", null, subjects, "FVE Chunk construction :: Pipeline context returned no chunks");
                        }

                        knowledge.WLChunkTableOfIndustryClass.Clear();
                        knowledge.WLChunkTableOfIndustryClass = chunkTableConstructor.process(MCChunks, cnt_level.mcPage, knowledge.WLChunkTableOfIndustryClass, null, logger, subjects.Count == 1);
                    }
                }
            }
            else
            {
            }

            if (knowledge.doBuildSemanticCloud)
            {
                if ((subjects.Count > 1))
                {
                    if (knowledge.WLChunkTableOfIndustryClass.Count > 0)
                    {
                        if (knowledge.semanticCloud.Any())
                        {
                            if (tools.operation.doUseExistingKnowledge)
                            {
                                logger.log(" ::: Rebuilding semantic cloud for [" + subjects.Count + "] subjects, despite the cloud already had [" + knowledge.semanticCloud.CountNodes() + "] nodes and doUseExistingKnowledge=true !! ");
                                logger.log(" ::: This is not proper behaviour --- seems the code has bugs :)");
                            }
                        }

                        knowledge.semanticCloud.Clear();
                        knowledge.semanticCloud      = CloudConstructor.process(knowledge.WLChunkTableOfIndustryClass, knowledge.WLTableOfIndustryClass, knowledge.semanticCloud, logger, subjects, tools.GetLemmaResource());
                        knowledge.semanticCloud.name = knowledge.name;

                        knowledge.semanticCloud.description = "Original semantic cloud, extracted from chunks";

                        if (tools.operation.doUseSimpleGraphs)
                        {
                            knowledge.semanticCloud.GetSimpleGraph(true).Save(knowledge.folder.pathFor("class_" + knowledge.semanticCloud.className + "_initialCloud", getWritableFileMode.overwrite));
                        }
                        else
                        {
                            var converter = lemmaSemanticCloud.GetDGMLConverter();
                            converter.ConvertToDMGL(knowledge.semanticCloud).Save(knowledge.folder.pathFor("class_" + knowledge.semanticCloud.className + "_initialCloud", getWritableFileMode.overwrite));
                        }

                        //if (tools.operation.doMakeGraphForClassClouds)
                        //{
                        //    var converter = lemmaSemanticCloud.GetDGMLConverter();
                        //    converter.ConvertToDMGL(knowledge.semanticCloud).Save(knowledge.folder.pathFor(knowledge.name + "_initialCloud", getWritableFileMode.overwrite, "Semantic cloud in initial state - before Cloud Matrix filter applied"));
                        //}

                        if (knowledge.semanticCloud.CountNodes() == 0)
                        {
                            throw new aceScienceException("Semantic cloud [" + knowledge.name + "] construction failed -- zero nodes produced!", null, knowledge, "Sound cloud construction failed", subjects);
                        }
                    }
                }
            }



            if (tools.DoReport)
            {
                //   knowledge.WLTableOfIndustryClass.GetDataTable().GetReportAndSave(knowledge.folder, appManager.AppInfo, "wfl_" + knowledge.name);
                //  knowledge.WLChunkTableOfIndustryClass.GetDataTable().GetReportAndSave(knowledge.folder, appManager.AppInfo, "wfl_" + knowledge.name + "_chunks");
            }
        }
        public override semanticFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
        {
            semanticFVExtractorKnowledge knowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance <semanticFVExtractorKnowledge>(documentSetClass, vCaseColl.kFoldCase, logger);

            knowledge.SetRebuild(!tools.DoUseExistingKnowledge);


            if (knowledge.ShouldBuildAny())
            {
                DocumentSetCaseCollection dSetCol = new DocumentSetCaseCollection(documentSetClass);


                var context = tools.context.pipelineCollection.GetContext(tools, documentSetClass);

                //var sites = context.exitByType[typeof(pipelineTaskMCSiteSubject)].ConvertList<IPipelineTaskSubject, pipelineTaskMCSiteSubject>().ToList();
                var sites = context.exitByType[typeof(pipelineTaskMCSiteSubject)].ToList();
                List <pipelineTaskMCSiteSubject> ISites = sites.ConvertList <IPipelineTaskSubject, pipelineTaskMCSiteSubject>().ToList();

                List <pipelineTaskMCSiteSubject> fSites = vCaseColl.FilterSites(ISites);


                dSetCol.deploy(vCaseColl, validationCase, fSites, classes);

                List <webLemmaTermTable> tables = new List <webLemmaTermTable>();
                //List<webLemmaTermTable> chunkTables = new List<webLemmaTermTable>();



                foreach (DocumentSetCase vc in dSetCol)
                {
                    semanticFVExtractorKnowledge cKnowledge = vCaseColl.kFoldMaster.knowledgeLibrary.GetKnowledgeInstance <semanticFVExtractorKnowledge>(vc, validationCase, logger);
                    DoMakeKnowledgeForCase(vc, tools, dSetCol, logger);
                    tables.Add(cKnowledge.WLTableOfIndustryClass);
                }

                var tbl = tables.GetMergedLemmaTable(knowledge.name, logger);
                termTableConstructor.recompute(knowledge.WLTableOfIndustryClass, logger, false, tbl.GetList());



                DoMakeKnowledge(fSites, tools, knowledge, logger);
            }

            //  SetKnowledge(knowledge);
            //knowledge.OnBeforeSave();

            logger.log("[ALTPROC] Feature Extraction by [" + name + "][" + vCaseColl.kFoldCase.name + "][" + documentSetClass.name + "] done for " + vCaseColl.className);

            return(knowledge);
        }