Exemplo n.º 1
0
        /// <summary>
        /// Returns the feature-vector extraction knowledge of <paramref name="documentSetClass"/>, building it from the
        /// cases of that class when the knowledge instance reports that something has to be built.
        /// </summary>
        /// <param name="vCaseColl">Validation case collection; supplies the knowledge library, the fold case used for the class-level lookup and the site filter.</param>
        /// <param name="documentSetClass">Class the knowledge is requested for.</param>
        /// <param name="validationCase">Fold handed to the case deployment and to the per-case knowledge lookups.</param>
        /// <param name="tools">Classifier tools; provides the pipeline context and the reuse-existing-knowledge switch.</param>
        /// <param name="logger">Log sink passed to every helper and used for the completion message.</param>
        /// <returns>The class-level knowledge instance.</returns>
        public override semanticFVExtractorKnowledge DoFVExtractionForClassViaCases(validationCaseCollection vCaseColl, IDocumentSetClass documentSetClass, kFoldValidationCase validationCase, classifierTools tools, ILogBuilder logger)
        {
            var library = vCaseColl.kFoldMaster.knowledgeLibrary;

            semanticFVExtractorKnowledge classKnowledge = library.GetKnowledgeInstance<semanticFVExtractorKnowledge>(documentSetClass, vCaseColl.kFoldCase, logger);

            // A rebuild is requested whenever the tools are NOT configured to reuse existing knowledge.
            classKnowledge.SetRebuild(!tools.DoUseExistingKnowledge);

            if (classKnowledge.ShouldBuildAny())
            {
                DocumentSetCaseCollection caseCollection = new DocumentSetCaseCollection(documentSetClass);

                var pipelineContext = tools.context.pipelineCollection.GetContext(tools, documentSetClass);

                // Take the pipeline exit subjects registered for the site type, convert them to the concrete
                // site subject type and let the validation collection filter them.
                List<pipelineTaskMCSiteSubject> allSites = pipelineContext.exitByType[typeof(pipelineTaskMCSiteSubject)]
                    .ToList()
                    .ConvertList<IPipelineTaskSubject, pipelineTaskMCSiteSubject>()
                    .ToList();
                List<pipelineTaskMCSiteSubject> filteredSites = vCaseColl.FilterSites(allSites);

                caseCollection.deploy(vCaseColl, validationCase, filteredSites, classes);

                // Build knowledge for every case and collect the per-case lemma tables.
                List<webLemmaTermTable> caseTables = new List<webLemmaTermTable>();
                foreach (DocumentSetCase documentCase in caseCollection)
                {
                    semanticFVExtractorKnowledge caseKnowledge = library.GetKnowledgeInstance<semanticFVExtractorKnowledge>(documentCase, validationCase, logger);
                    DoMakeKnowledgeForCase(documentCase, tools, caseCollection, logger);
                    caseTables.Add(caseKnowledge.WLTableOfIndustryClass);
                }

                // Merge the case tables and recompute the class-level table from the merged entries.
                var mergedTable = caseTables.GetMergedLemmaTable(classKnowledge.name, logger);
                termTableConstructor.recompute(classKnowledge.WLTableOfIndustryClass, logger, false, mergedTable.GetList());

                DoMakeKnowledge(filteredSites, tools, classKnowledge, logger);
            }

            logger.log("[ALTPROC] Feature Extraction by [" + name + "][" + vCaseColl.kFoldCase.name + "][" + documentSetClass.name + "] done for " + vCaseColl.className);

            return classKnowledge;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Creates an empty report table (columns only) for the cases of a <see cref="DocumentSetCaseCollection"/>.
        /// </summary>
        /// <param name="host">Collection whose <c>setClass</c> and <c>validationCase</c> drive the table name and the columns.</param>
        /// <param name="isSingleCategoryReport">When <c>false</c>, an additional "Origin" column is added in front of the "Case" column.</param>
        /// <param name="isTrainingCollection">When <c>true</c>, the "Correct" column and the per-classifier result columns are omitted.</param>
        /// <param name="doFVAnalysis">When <c>true</c>, the "FVRange" and "CFV_Ratio" control columns are added for every active feature vector.</param>
        /// <returns>The table with its column schema defined and no rows.</returns>
        public static DataTable BuildShema(this DocumentSetCaseCollection host, Boolean isSingleCategoryReport = true, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
        {
            var setClass       = host.setClass;
            var validationCase = host.validationCase;

            String tableName;

            if (isSingleCategoryReport)
            {
                tableName = (setClass.name + validationCase.name).getCleanFilepath();
            }
            else if (isTrainingCollection)
            {
                tableName = (setClass.name + validationCase.name + "_training").getCleanFilepath();
            }
            else
            {
                // Parenthesised so the whole name is cleaned: member access binds tighter than '+', so without
                // the parentheses only the literal "full" was passed to the cleaner.
                // NOTE(review): this branch calls getCleanFilePath while the other two call getCleanFilepath —
                // kept as in the original; confirm whether both helpers are intended.
                tableName = (validationCase.name + "full").getCleanFilePath();
            }

            DataTable output = new DataTable(tableName);

            if (!isSingleCategoryReport)
            {
                output.Add("Origin", "Name of the origin class", "", typeof(String), imbSCI.Core.enums.dataPointImportance.normal, "", "Origin").SetWidth(25).SetGroup("Case").SetDefaultBackground(Color.OrangeRed);
            }

            output.Add("Case", "Name of the case evaluated", "C_n", typeof(String), imbSCI.Core.enums.dataPointImportance.normal, "", "Case name").SetWidth(25).SetGroup("Case").SetDefaultBackground(Color.OrangeRed);

            if (!isTrainingCollection)
            {
                output.Add("Correct", "Number of classifiers that classified this case correctly. Web sites with zero or very low values accross multiple experiments are probably problem in the training set", "", typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "P1", "Mean Success Rate").SetUnit("%").SetGroup("Case Control");

                // Two columns per classifier: a 0/1 correctness flag and the name of the assigned class.
                foreach (var cl in validationCase.context.setup.classifiers)
                {
                    output.Add("EvalTrue" + cl.name, "If classification result is correct", "C_" + cl.name, typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal, "", "True").SetGroup(cl.name).SetUnit("Tp").SetWidth(7);
                    output.Add("ClassResultName" + cl.name, "Name of class associated by [" + cl.GetExperimentSufix() + "][" + cl.GetType().Name + "] classifier", "R_" + cl.name, typeof(String), imbSCI.Core.enums.dataPointImportance.normal, "", "Class name").SetGroup(cl.name).SetWidth(14);
                }
            }

            // One column per (class, active feature vector) pair.
            foreach (var pair in setClass.parent.GetClasses())
            {
                foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                {
                    if (fv.isActive)
                    {
                        output.Add(fv.name + "_" + pair.treeLetterAcronim, fv.description + " - for " + pair.name, fv.name + "_" + pair.classID, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5", fv.name + " for " + pair.name).SetGroup("FEATURE VECTORS");
                    }
                }
            }

            if (doFVAnalysis)
            {
                // Two control-metric columns per active feature vector.
                foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
                {
                    if (fv.isActive)
                    {
                        output.Add("FVRange" + fv.name, "Standard deviation of values in the similarity vector [" + fv.name + "] for this row", fv.name, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5", fv.name + " S.Dev.").SetGroup("FVE Control metrics").SetDefaultBackground("#FF22639a");
                        output.Add("CFV_Ratio" + fv.name, "Value ratio indicating the position of correct category FV, within the range", fv.name, typeof(Double), imbSCI.Core.enums.dataPointImportance.normal, "F5", fv.name + " Range Position").SetGroup("FVE Control metrics").SetDefaultBackground("#FF22639a");
                    }
                }
            }

            output.Add("name", "UID name for the row", "id", typeof(String), imbSCI.Core.enums.dataPointImportance.normal, "", "Name").SetWidth(50).SetGroup("Case");
            return(output);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates a row for <paramref name="setCase"/>, fills it and appends it to <paramref name="output"/>.
        /// </summary>
        /// <param name="host">Collection the case belongs to; supplies the fold, the class and the expected class ID.</param>
        /// <param name="setCase">Case whose classification results and feature vector values are written.</param>
        /// <param name="output">Target table; the columns written here must already exist in it.</param>
        /// <param name="isTrainingCollection">When <c>true</c>, the per-classifier columns and "Correct" are not written.</param>
        /// <param name="doFVAnalysis">When <c>true</c>, the "FVRange" and "CFV_Ratio" columns are written for every active feature vector.</param>
        /// <returns>The row that was added to <paramref name="output"/>.</returns>
        public static DataRow BuildRow(this DocumentSetCaseCollection host, DocumentSetCase setCase, DataTable output, Boolean isTrainingCollection = false, Boolean doFVAnalysis = true)
        {
            var ownClass = host.setClass;
            var fold     = host.validationCase;
            var vectors  = fold.extractor.settings.featureVectors.serialization;
            var caseName = setCase.subject.name;

            DataRow row = output.NewRow();

            row["name"] = fold.name + "_" + caseName;

            // "Origin" is optional in the schema, so it is written only when the column is present.
            if (output.Columns.Contains("Origin"))
            {
                row["Origin"] = ownClass.name;
            }

            row["Case"] = caseName;

            if (!isTrainingCollection)
            {
                var classifiers = fold.context.setup.classifiers;
                Int32 correctCount = 0;

                foreach (var classifier in classifiers)
                {
                    var selected = setCase.data[classifier].selected;

                    // A classifier without a selected class counts as incorrect and gets a placeholder name.
                    String resultName = (selected != null) ? selected.name : "- not set -";
                    Int32  hit        = (selected != null && selected.classID == host.rightClassID) ? 1 : 0;

                    row["ClassResultName" + classifier.name] = resultName;
                    correctCount += hit;
                    row["EvalTrue" + classifier.name] = hit;
                }

                row["Correct"] = correctCount.GetRatio(classifiers.Count);
            }

            // Feature vector value of every active vector, for every class.
            foreach (var cls in setCase.data.setClassCollection.GetClasses())
            {
                foreach (var fv in vectors)
                {
                    if (!fv.isActive)
                    {
                        continue;
                    }

                    row[fv.name + "_" + cls.treeLetterAcronim] = setCase.data.featureVectors[cls.classID][fv];
                }
            }

            if (doFVAnalysis)
            {
                // One range finder per active feature vector, fed with that vector's value for every class.
                Dictionary<String, rangeFinderWithData> rangers = new Dictionary<String, rangeFinderWithData>();

                foreach (var cls in setCase.data.setClassCollection.GetClasses())
                {
                    foreach (var fv in vectors)
                    {
                        if (!fv.isActive)
                        {
                            continue;
                        }

                        rangeFinderWithData ranger;
                        if (!rangers.TryGetValue(fv.name, out ranger))
                        {
                            ranger = new rangeFinderWithData(fv.name);
                            rangers.Add(fv.name, ranger);
                        }

                        ranger.Learn(setCase.data.featureVectors[cls.classID][fv]);
                    }
                }

                foreach (var fv in vectors)
                {
                    if (!fv.isActive)
                    {
                        continue;
                    }

                    // Indexer lookup on purpose: it fails if no range finder was created for the vector.
                    rangeFinderWithData ranger = rangers[fv.name];

                    row["FVRange" + fv.name]   = ranger.doubleEntries.GetStdDeviation(false);
                    row["CFV_Ratio" + fv.name] = ranger.GetPositionInRange(setCase.data.featureVectors[ownClass.classID][fv]);
                }
            }

            output.Rows.Add(row);
            return row;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Attaches report metadata (additional info entries, title, description, extra lines and the aggregation
        /// aspect) to <paramref name="output"/>.
        /// </summary>
        /// <param name="host">Collection whose <c>setClass</c> and <c>validationCase</c> supply the reported names and counts.</param>
        /// <param name="output">Table that receives the metadata.</param>
        /// <param name="isSingleCategoryReport">Selects the single-category texts and adds the "Category" entry; takes precedence over <paramref name="isTrainingCollection"/>.</param>
        /// <param name="isTrainingCollection">Selects the training-input texts when <paramref name="isSingleCategoryReport"/> is <c>false</c>.</param>
        public static void SetAdditionalInfo(this DocumentSetCaseCollection host, DataTable output, Boolean isSingleCategoryReport = true, Boolean isTrainingCollection = false)
        {
            var setClass       = host.setClass;
            var validationCase = host.validationCase;

            if (isSingleCategoryReport)
            {
                output.SetAdditionalInfoEntry("Category", setClass.name);
            }

            output.SetAdditionalInfoEntry("Feature extractor", validationCase.extractor.name);
            // First() throws when the fold has no training / evaluation case groups.
            output.SetAdditionalInfoEntry("Training cases", validationCase.trainingCases.First().Count());
            output.SetAdditionalInfoEntry("Eval cases", validationCase.evaluationCases.First().Count());
            output.SetAdditionalInfoEntry("Fold case", validationCase.name);

            // The original re-tested isSingleCategoryReport inside the first two branches, where its value is
            // already fixed by the enclosing condition; only the reachable AddExtra call of each branch is kept.
            if (isSingleCategoryReport)
            {
                output.SetDescription("Results of [" + validationCase.name + "] fold, for class[" + setClass.name + "]");
                output.SetTitle(setClass.name + " in " + validationCase.name);
                output.AddExtra("Test sample subset :: Case-level Feature Extraction data on [" + setClass.name + "] class.");
            }
            else if (isTrainingCollection)
            {
                output.SetTitle(validationCase.name + " training inputs");
                output.SetDescription("Feature Vectors used for [" + validationCase.name + "] fold, class[" + setClass.name + "]");
                output.AddExtra("Training sample subset :: Case-level cassification results and feature Extraction data on [" + setClass.name + "] class [" + validationCase.name + "]");
            }
            else
            {
                output.SetTitle(validationCase.name + " - all classes");
                output.SetDescription("Results of [" + validationCase.name + "] fold - all classes");
            }

            // Report every feature vector that is switched off.
            foreach (var fv in validationCase.extractor.settings.featureVectors.serialization)
            {
                if (!fv.isActive)
                {
                    // NOTE(review): this is the only call to SetAdditionalInfoEntries (plural) in the method; the
                    // other entries use SetAdditionalInfoEntry — confirm the plural overload is intended here.
                    output.SetAdditionalInfoEntries("FV " + fv.name, "is not active");
                }
            }

            output.SetAggregationAspect(imbSCI.Core.math.aggregation.dataPointAggregationAspect.subSetOfRows);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Runs the k-fold experiment sequence for one feature vector extractor <paramref name="model"/>: executes
        /// feature extraction, training and classification for every fold (in parallel), then builds the
        /// per-category metrics, the all-categories table and the k-fold reports.
        /// </summary>
        /// <remarks>
        /// The fold run is wrapped in a retry loop driven by <c>doRebootFVEOnCrashRetryLimit</c>. Any exception
        /// decrements the budget and, while budget remains, the whole fold set is executed again. Partial results
        /// of a crashed attempt are discarded before the next attempt so folds are not counted twice.
        /// </remarks>
        /// <param name="context">Experiment execution context (tools, loggers, notes, validation collections, main report).</param>
        /// <param name="model">Feature vector extractor to evaluate.</param>
        protected void runModel(experimentExecutionContext context, IWebFVExtractor model)
        {
            imbSCI.Core.screenOutputControl.logToConsoleControl.setAsOutput(context.logger, model.name);

            aceDictionarySet<IDocumentSetClass, DocumentSetCaseCollection> casesByClasses = new aceDictionarySet<IDocumentSetClass, DocumentSetCaseCollection>();
            DSCCReportSet kFoldReport = new DSCCReportSet(model);
            var           valCol      = context.validationCollections[model.name];

            List<DocumentSetCaseCollectionSet> modelCaseResults = new List<DocumentSetCaseCollectionSet>();

            // NOTE(review): with a limit of 0 (or less) the loop body never runs, i.e. the folds are never
            // executed — confirm that this is the intended meaning of the setting.
            Int32 crashRetries = context.tools.operation.doRebootFVEOnCrashRetryLimit;
            while (crashRetries > 0)
            {
                try
                {
                    experimentNotes modelNotes = new experimentNotes(valCol.folder, "Fold-level experiment settings description notes");
                    modelNotes.AppendLine("# Notes on Feature Vector Extractor: " + model.name);

                    var nts = model.DescribeSelf();
                    nts.ForEach(x => modelNotes.AppendLine(x));

                    context.logger.log("Executing k-fold cases with model [" + model.name + "]");

                    valCol.DescribeSampleDistribution(modelNotes);

                    // Parenthesised so the complete "[name]" label is width-adjusted; without the parentheses
                    // toWidthExact was applied to the closing "]" only (member access binds tighter than '+').
                    context.mainReport.valColVsModelVsSampleHash.Add(("[" + model.name + "]").toWidthExact(20) + " [sample distribution hash: " + valCol.SampleDistributionHash + "]");

                    modelNotes.SaveNote();

                    ParallelOptions ops = new ParallelOptions();
                    ops.MaxDegreeOfParallelism = context.tools.operation.ParallelThreads;

                    Parallel.ForEach<kFoldValidationCase>(valCol.GetCases(), ops, valCase =>
                    {
                        // Feature vector extraction and classifier training for this fold.
                        model.DoFVEAndTraining(valCase, context.tools, context.logger);

                        DocumentSetCaseCollectionSet results = model.DoClassification(valCase, context.tools, context.logger);

                        if (!results.Any())
                        {
                            throw new aceScienceException("DoClassification for [" + model.name + "] returned no results!", null, model, "DoClassification " + model.name + " failed!", context);
                        }

                        // Group the evaluated case collections by their class, across all folds.
                        foreach (var pair in results)
                        {
                            DocumentSetCaseCollection cls = pair.Value;
                            casesByClasses.Add(cls.setClass, cls);
                        }

                        valCase.evaluationResults = results;

                        if (context.tools.DoResultReporting)
                        {
                            context.logger.log("producing reports on k-Fold case [" + valCase.name + "]");
                            DSCCReports r = results.GetReports();

                            var sumMeans = r.GetAverageTable(context);
                            sumMeans.SetDescription("FVE report, aggregated for all categories - for fold [" + valCase.name + "]");
                            sumMeans.GetReportAndSave(valCase.folder, appManager.AppInfo, "CrossValidation_" + valCase.name, true, context.tools.operation.doReportsInParalell);

                            var fveAndCase = r.GetFullValidationTable(context);
                            fveAndCase.SetDescription("Per-category aggregate statistics, for each classifier, within fold [" + valCase.name + "], used for macro-averaging");
                            fveAndCase.GetReportAndSave(valCase.folder, appManager.AppInfo, "CrossValidation_extrainfo_" + valCase.name, true, context.tools.operation.doReportsInParalell);

                            var fullCaseReport = results.GetReportOnAllCases();
                            fullCaseReport.GetReportAndSave(valCase.folder, appManager.AppInfo, "FullReport_" + valCase.name, true, context.tools.operation.doReportsInParalell);

                            kFoldReport.Add(valCase, r);
                        }

                        context.logger.log("k-Fold case [" + valCase.name + "] completed");

                        context.notes.log("- - Experiment sequence for [" + valCase.name + "] fold completed");
                        if (context.tools.operation.doSaveKnowledgeForClasses)
                        {
                            valCase.knowledgeLibrary.SaveKnowledgeInstancesForClasses(valCase, context.logger);
                        }
                    });

                    // Collected only after every fold finished without an exception.
                    foreach (var fold in valCol.GetCases())
                    {
                        modelCaseResults.Add(fold.evaluationResults);
                    }

                    // Success: leave the retry loop.
                    crashRetries = 0;
                }
                catch (Exception ex)
                {
                    crashRetries--;
                    context.errorNotes.LogException("FVE Model crashed -- retries left [" + crashRetries + "] --- ", ex, model.name);
                    context.logger.log(":::: REPEATING the model [" + model.name + "] ::: CRASHED [" + ex.Message + "] ::: RETRIES [" + crashRetries + "]");
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 1000, 1);
                    imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(2400, 1000, 1);
                    imbSCI.Core.screenOutputControl.logToConsoleControl.setAsOutput(context.logger, "RETRIES[" + crashRetries + "]");

                    if (crashRetries > 0)
                    {
                        // Another attempt follows and re-runs every fold: drop what the crashed attempt already
                        // collected, otherwise the folds that had completed would be registered a second time.
                        casesByClasses = new aceDictionarySet<IDocumentSetClass, DocumentSetCaseCollection>();
                        kFoldReport    = new DSCCReportSet(model);
                        modelCaseResults.Clear();
                    }
                }
            }

            imbSCI.Core.screenOutputControl.logToConsoleControl.setAsOutput(context.logger, "Reporting");

            valCol.knowledgeLibrary.SaveCaseKnowledgeInstances(context.logger);

            if (modelCaseResults.Any())
            {
                featureExtractionMetrics modelMetrics = new featureExtractionMetrics(model.name, "All");
                DataTableTypeExtended<featureExtractionMetrics> modelVsCategoryMetrics = new DataTableTypeExtended<featureExtractionMetrics>(model.name, "Model metrics per category");

                // ---- categories report ----

                // Empty table with the schema of the first fold's first report table.
                DataTable allTable = modelCaseResults.First()[0].GetReportTable(false, false).GetClonedShema<DataTable>();

                rangeFinderForDataTable ranger = new rangeFinderForDataTable(allTable, "name");
                ranger.columnsToSignIn.Add("Case");

                foreach (KeyValuePair<IDocumentSetClass, aceConcurrentBag<DocumentSetCaseCollection>> pair in casesByClasses)
                {
                    ranger.prepareForNextAggregationBlock(allTable, "name");

                    // One row per non-null case of this class, from every fold.
                    foreach (DocumentSetCaseCollection cn in pair.Value)
                    {
                        foreach (var cni in cn)
                        {
                            if (cni != null)
                            {
                                cn.BuildRow(cni, allTable, false);
                            }
                        }
                    }

                    ranger.AddRangeRows(pair.Key.name, allTable, true, imbSCI.Core.math.aggregation.dataPointAggregationType.avg | imbSCI.Core.math.aggregation.dataPointAggregationType.stdev);

                    var categoryMetrics = new featureExtractionMetrics(model.name, pair.Key.name);
                    categoryMetrics.SetValues(ranger);

                    modelVsCategoryMetrics.AddRow(categoryMetrics);
                    modelMetrics.AddValues(categoryMetrics);

                    categoryMetrics.saveObjectToXML(valCol.folder.pathFor(model.name + "_" + categoryMetrics.Name + ".xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "FV and Category sample metrics, serialized object"));
                }

                // Category metrics were summed into modelMetrics above; divide by the number of categories.
                modelMetrics.DivideValues(casesByClasses.Count);
                modelMetrics.saveObjectToXML(valCol.folder.pathFor(model.name + "_metrics.xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Cross-categories macroaveraged metrics of the FVE model [" + model.name + "]"));

                modelVsCategoryMetrics.AddRow(modelMetrics);
                modelVsCategoryMetrics.GetRowMetaSet().SetStyleForRowsWithValue<String>(DataRowInReportTypeEnum.dataHighlightA, "Name", modelMetrics.Name);
                modelVsCategoryMetrics.GetReportAndSave(valCol.folder, appManager.AppInfo, model.name + "_metrics", true, true);

                context.mainReport.AddModelMetrics(modelMetrics);

                context.notes.log("- Creating report for all categories [" + model.name + "] ");
                allTable.GetReportAndSave(valCol.folder, appManager.AppInfo, model.name + "_categories", true, context.tools.operation.doReportsInParalell);
            }

            kFoldReport.MakeReports(context, valCol.folder);
            context.mainReport.AddBestPerformer(kFoldReport.GetTopClassifierReport(), kFoldReport.meanClassifierReport, model);

            context.notes.log("- Experiment sequence with Feature Vector Extractor [" + model.name + "] completed");
            context.notes.SaveNote();
        }