示例#1
0
        /// <summary>
        /// Runs the entity, corpus, vector and feature plane methods in sequence, evaluates the test
        /// results against the truth table and saves the evaluation tables and the averaged report.
        /// </summary>
        /// <param name="inputContext">The input context, handed to the entity plane method.</param>
        /// <param name="generalContext">General execution context; supplies the truth table, run name and averaging method, and receives the averaged report in its test summaries.</param>
        /// <param name="logger">The logger passed to every plane method and to the evaluation.</param>
        /// <returns>
        /// The <paramref name="generalContext"/> instance.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            // Declared outside the try block so the finally block can release its provider.
            IFeaturePlaneContext featureContext = null;

            try
            {
                // Each plane method consumes the context produced by the previous one.
                ICorpusPlaneContext corpusContext = EntityMethod.ExecutePlaneMethod(inputContext, generalContext, logger) as ICorpusPlaneContext;

                IVectorPlaneContext vectorContext = CorpusMethod.ExecutePlaneMethod(corpusContext, generalContext, logger) as IVectorPlaneContext;

                IFeaturePlaneContext featureSpaceContext = VectorMethod.ExecutePlaneMethod(vectorContext, generalContext, logger) as IFeaturePlaneContext;

                featureContext = FeatureMethod.ExecutePlaneMethod(featureSpaceContext, generalContext, logger) as IFeaturePlaneContext;

                // --- the results reporting

                var evaluationMetrics = generalContext.truthTable.EvaluateTestResultsToMetricSet(featureContext.testResults, generalContext.runName + "-" + notes.folder.name, logger);

                // one row per entry of the metric set, followed by a summary row
                DataTableTypeExtended <classificationEval> inclassEvalTable = new DataTableTypeExtended <classificationEval>("inclass_evaluation", "Test results, per class");

                evaluationMetrics.GetAllEntries().ForEach(x => inclassEvalTable.AddRow(x));
                inclassEvalTable.AddRow(evaluationMetrics.GetSummary("Sum"));
                notes.SaveDataTable(inclassEvalTable, notes.folder_classification);

                classificationReport averagedReport = new classificationReport(evaluationMetrics, generalContext.averagingMethod);

                averagedReport.Classifier = FeatureMethod.classifier.name;
                averagedReport.saveObjectToXML(notes.folder_classification.pathFor(averagedReport.Name + ".xml", imbSCI.Data.enums.getWritableFileMode.overwrite, "Serialized classification evaluation results summary"));

                generalContext.testSummaries.Add(averagedReport);

                averagedReport.ReportToLog(notes);
            }
            finally
            {
                // Release the providers even when a plane method or the reporting code throws.
                if (featureContext != null)
                {
                    featureContext.provider.Dispose();
                }
                EntityMethod.CacheProvider.Dispose();
            }

            return(generalContext);
        }
        /// <summary>
        /// Vector Plane execution: converts every document vector of the incoming vector space into a
        /// feature vector and carries the label-to-document links over to the new feature space.
        /// </summary>
        /// <param name="inputContext">The input context - cast to <see cref="IVectorPlaneContext"/>.</param>
        /// <param name="generalContext">General execution context; its report options decide whether the feature space table is saved.</param>
        /// <param name="logger">The logger (not used by this method).</param>
        /// <returns>
        /// The newly created <see cref="FeaturePlaneContext"/> that holds the constructed feature space.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[3] Vector Plane - execution", "");

            var vectorInput   = (IVectorPlaneContext)inputContext;
            var featureOutput = new FeaturePlaneContext();

            // hand the incoming context to the provider, then read the corpus context back from it
            featureOutput.provider.StoreAndReceive(vectorInput);
            ICorpusPlaneContext corpusInput = featureOutput.provider.GetContext <CorpusPlaneContext>();

            // the constructor is deployed with the vector space first and the selected features second
            featureSpaceConstructor.Deploy(constructorSettings, vectorInput.vectorSpace);
            featureSpaceConstructor.Deploy(constructorSettings, corpusInput.SelectedFeatures);

            notes.log(":: Constructing feature vectors");

            // feature vectors indexed by name, needed below to resolve the label links
            var vectorsByName = new Dictionary <string, FeatureVector>();
            foreach (IVector documentVector in vectorInput.vectorSpace.documents)
            {
                var featureVector = featureSpaceConstructor.ConstructFeatureVector(documentVector);
                vectorsByName.Add(featureVector.name, featureVector);
                featureOutput.featureSpace.documents.Add(featureVector);
            }

            // re-create each label/document link, now pointing at the feature vector of the same name
            foreach (var labelLink in vectorInput.LabelToDocumentLinks.links)
            {
                FeatureVector linkedVector = vectorsByName[labelLink.NodeB.name];
                featureOutput.featureSpace.labelToDocumentAssociations.Add(linkedVector, labelLink.NodeA, 1);
            }

            Boolean saveFeatureSpace = generalContext.reportOptions.HasFlag(PlanesReportOptions.report_featureVectors);
            if (saveFeatureSpace)
            {
                notes.SaveDataTable(featureOutput.featureSpace.MakeTable(featureSpaceConstructor, "FeatureSpace", "Feature space"), notes.folder_feature);
            }

            notes.logEndPhase();

            return(featureOutput);
        }
示例#3
0
        /// <summary>
        /// Corpus Plane execution: builds the space models of the corpus documents, selects the features,
        /// creates the category models and produces the weighted vector space for documents and categories.
        /// </summary>
        /// <param name="inputContext">The input context - cast to <see cref="ICorpusPlaneContext"/>.</param>
        /// <param name="generalContext">General execution context; its report options control which text renders and tables are saved.</param>
        /// <param name="logger">The logger; used only to report that the feature filter returned an empty set.</param>
        /// <returns>
        /// A new <see cref="VectorPlaneContext"/> carrying the vector space and the label-to-document links.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[2] Corpus Plane - execution", "");

            ICorpusPlaneContext context       = (ICorpusPlaneContext)inputContext;
            VectorPlaneContext  outputContext = new VectorPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            context.stemmContext = new StemmingContext(stemmer);

            // document name -> constructed model, used by the per-dataset statistics report
            Dictionary <String, SpaceDocumentModel> documentVsModel = new Dictionary <string, SpaceDocumentModel>();

            // modelling the documents
            foreach (TextDocument doc in context.corpus_documents)
            {
                SpaceDocumentModel model          = spaceConstructor.ConstructDocument(doc.content, doc.name, context.space, context.stemmContext, tokenizer);
                List <SpaceLabel>  labels         = spaceConstructor.GetLabels(doc.labels, context.space);
                Boolean            isUnknownLabel = true;
                foreach (SpaceLabel label in labels)
                {
                    if (label.name != SpaceLabel.UNKNOWN)
                    {
                        isUnknownLabel = false;
                    }
                    context.space.LabelToDocumentLinks.Add(label, model, 1);
                }
                context.space.documents.Add(model);

                // terms are merged into the corpus dictionary only for documents that carry
                // at least one label other than UNKNOWN
                if (!isUnknownLabel)
                {
                    context.space.terms.MergeDictionary(model.terms);
                }

                documentVsModel.Add(doc.name, model);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_textrender))
            {
                // one text file per document, named with its first label (or UNKNOWN) as prefix
                foreach (TextDocument doc in context.corpus_documents)
                {
                    String prefix = doc.labels.FirstOrDefault();
                    if (prefix.isNullOrEmpty())
                    {
                        prefix = SpaceLabel.UNKNOWN;
                    }

                    String fn  = prefix + "_" + doc.name;
                    String pth = notes.folder_entity.pathFor(fn.getFilename("txt"), imbSCI.Data.enums.getWritableFileMode.overwrite, "Textual representation of website [" + doc.name + "], produced by rendering and blending settings", true);
                    doc.content.saveStringToFile(pth, imbSCI.Data.enums.getWritableFileMode.overwrite);
                }
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_stats))
            {
                foreach (WebSiteDocumentsSet ds in context.dataset)
                {
                    DataTable dt = ds.MakeTable(documentVsModel);
                    notes.SaveDataTable(dt, notes.folder_entity);
                }

                var dt_vsm = context.space.LabelToDocumentLinks.MakeTable("LabelToDocument", "Relationships between labels and documents in the primary Vector Space Model");
                notes.SaveDataTable(dt_vsm, notes.folder_corpus);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_corpusDictionary))
            {
                notes.SaveDataTable(context.space.terms.MakeTable("corpus_stats", "Training set dictionary, after stemming", generalContext.DictionaryReportLimit), notes.folder_corpus);
            }

            #region SELECTING THE FEATURES
            // forming corpus global weight
            context.SelectedFeatures = new WeightDictionary();
            List <KeyValuePair <string, double> > filter_result = filter.SelectFeatures(context.space);

            // names of the selected features, handed to the weight model below
            List <string> selectedTerms = new List <string>();
            selectedTerms.AddRange(filter_result.Select(x => x.Key));

            if (filter_result.Any())
            {
                foreach (var pair in filter_result)
                {
                    context.SelectedFeatures.AddEntry(pair.Key, pair.Value);
                }

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_selectedFeatures))
                {
                    notes.SaveDataTable(context.SelectedFeatures.MakeTable("selected_features", "Features selected for BoW construction", new List <string>()
                    {
                        filter.function.shortName
                    }, generalContext.DictionaryReportLimit), notes.folder_corpus);
                }
            }
            else
            {
                logger.log("-- Feature selection function returned zero set. All features [" + context.space.terms.Count + "] are therefore accepted as selected.");
                var tkns = context.space.terms.GetTokens();
                foreach (var tkn in tkns)
                {
                    context.SelectedFeatures.AddEntry(tkn, 1);

                    // Keep the term list in sync with SelectedFeatures: on this fallback path it
                    // used to stay empty, so the weight model was called with no features at all.
                    selectedTerms.Add(tkn);
                }
            }
            #endregion

            notes.log("Selected features [" + context.SelectedFeatures.entries.Count + "] by [" + filter.functionSettings.functionName + "]");

            outputContext.vectorSpace = new Vectors.VectorSpace();

            // a category model is created for every label except UNKNOWN
            foreach (SpaceLabel label in context.space.labels)
            {
                var docs = context.space.LabelToDocumentLinks.GetAllLinked(label);
                if (label.name != SpaceLabel.UNKNOWN)
                {
                    SpaceCategoryModel categoryModel = new SpaceCategoryModel(label, docs);
                    context.space.LabelToCategoryLinks.Add(label, categoryModel, 1);

                    context.space.categories.Add(categoryModel);

                    notes.log("Class [" + categoryModel.name + "] BoW model created - terms[" + categoryModel.terms.Count + "] ");
                }
            }

            outputContext.LabelToDocumentLinks = context.space.LabelToDocumentLinks;

            // preparing the model
            weightModel.PrepareTheModel(context.space);

            // the report flag does not change inside this method, so it is read once for all loops below
            Boolean reportBoWModels = generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels);

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.space.documents)
            {
                var            wd     = weightModel.GetWeights(selectedTerms, docModel, context.space);
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = wd;

                if (reportBoWModels)
                {
                    DataTable dt = wd.MakeTable("docVec_" + docModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
                outputContext.vectorSpace.documents.Add(docVec);
            }

            // building category VSM
            foreach (SpaceCategoryModel catModel in context.space.categories)
            {
                var         wd     = weightModel.GetWeights(selectedTerms, catModel, context.space);
                VectorLabel catVec = new VectorLabel(catModel.name);
                catVec.terms = wd;

                if (reportBoWModels)
                {
                    // NOTE(review): the description says "Document vector model" although this table
                    // holds a category vector - confirm whether that wording is intended.
                    DataTable dt = wd.MakeTable("catVec_" + catModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }

                outputContext.vectorSpace.labels.Add(catVec);
            }

            if (reportBoWModels)
            {
                foreach (SpaceCategoryModel catModel in context.space.categories)
                {
                    var dt = catModel.terms.MakeTable("cat_" + catModel.name, "Vector Space BoW weighted model, representing a category");
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
            }

            notes.logEndPhase();

            return(outputContext);
        }
示例#4
0
        /// <summary>
        /// Feature Plane execution: splits the feature vectors into a training set (vectors with an
        /// associated label) and a test set (vectors without one), trains the classifier and stores the
        /// label selected for every test vector in the context's test results.
        /// </summary>
        /// <param name="inputContext">The input context - cast to <see cref="IFeaturePlaneContext"/>.</param>
        /// <param name="generalContext">General execution context; its truth table supplies the label index of the training vectors.</param>
        /// <param name="logger">The logger passed to the classifier's training and selection calls.</param>
        /// <returns>
        /// The same context instance that was passed as <paramref name="inputContext"/>, with the
        /// training set, test set and test results filled in.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[4] Feature Plane - execution", "");

            // Direct cast: a context of the wrong type fails here with an InvalidCastException
            // instead of surfacing later as a NullReferenceException.
            IFeaturePlaneContext context = (IFeaturePlaneContext)inputContext;

            foreach (FeatureVector vec in context.featureSpace.documents)
            {
                var associated = context.featureSpace.labelToDocumentAssociations.GetAllLinked(vec);
                if (associated.Any())
                {
                    // only the first associated label is used
                    // NOTE(review): presumably -1 when the name is not in labels_without_unknown - verify
                    Int32 lbi = generalContext.truthTable.labels_without_unknown.IndexOf(associated.First().name);

                    FeatureVectorWithLabelID fvl = new FeatureVectorWithLabelID(vec, lbi);
                    context.trainingSet.Add(fvl);
                }
                else
                {
                    context.testSet.Add(vec);
                }
            }

            if (!context.testSet.Any())
            {
                notes.log("TEST SET IS EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");

                context.trainingSet.ForEach(x => context.testSet.Add(x.vector));
            }

            // Always start from a fresh result list, so the context never carries a missing or
            // stale list when training is skipped below.
            context.testResults = new List <FeatureVectorWithLabelID>();

            if ((!context.trainingSet.Any()))
            {
                // Nothing can be trained; the previous message wrongly claimed that the 1:1 scheme was applied here.
                notes.log("TRAINING SET EMPTY ---- classifier training and testing are skipped");
            }
            else
            {
                notes.log("Training [" + classifier.name + "] with [" + context.trainingSet.Count + "] feature vectors.");
                classifier.DoTraining(context.trainingSet, logger);

                notes.log("Testing [" + classifier.name + "] with [" + context.testSet.Count + "] feature vectors.");

                foreach (FeatureVector fv in context.testSet)
                {
                    Int32 result = classifier.DoSelect(fv, logger);
                    FeatureVectorWithLabelID fvl = new FeatureVectorWithLabelID(fv, result);
                    context.testResults.Add(fvl);
                }
            }

            notes.logEndPhase();

            return(context);
        }
示例#5
0
        /// <summary>
        /// Entity Plane execution: renders the web sites of the dataset into text document sets (read
        /// from the cache provider when it is ready), optionally filters them, and blends them into the
        /// text documents of the output corpus context.
        /// </summary>
        /// <param name="inputContext">The input context - cast to <see cref="IEntityPlaneContext"/>.</param>
        /// <param name="generalContext">General execution context (not used by this method).</param>
        /// <param name="logger">The logger passed to the rendering calls.</param>
        /// <returns>
        /// A new <see cref="CorpusPlaneContext"/> holding the dataset, the labels and the corpus documents.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            if (notes != null)
            {
                notes.logStartPhase("[1] Entity Plane - execution", "");
            }

            // Direct cast: a context of the wrong type fails here with an InvalidCastException
            // instead of surfacing later as a NullReferenceException.
            IEntityPlaneContext context       = (IEntityPlaneContext)inputContext;
            CorpusPlaneContext  outputContext = new CorpusPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            outputContext.dataset = context.dataset;

            // ---------------- rendering procedure
            Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> > renderIndex = new Dictionary <WebSiteDocumentsSet, List <TextDocumentSet> >();

            // label name -> label; Add throws when two document sets resolve to the same label name
            Dictionary <string, SpaceLabel> labels = new Dictionary <string, SpaceLabel>();

            Dictionary <WebSiteDocuments, TextDocumentSet>    sitesToRenders    = new Dictionary <WebSiteDocuments, TextDocumentSet>();
            Dictionary <String, WebSiteDocuments>             inputSites        = new Dictionary <string, WebSiteDocuments>();
            Dictionary <String, TextDocumentSet>              inputTextRenders  = new Dictionary <string, TextDocumentSet>();
            Dictionary <WebSiteDocuments, List <SpaceLabel> > inputSiteVsLabels = new Dictionary <WebSiteDocuments, List <SpaceLabel> >();

            Int32 c = 0;

            // rendering
            foreach (WebSiteDocumentsSet docSet in context.dataset)
            {
                // The label of this document set is kept in a local. Looking it up again by
                // docSet.name failed for sets with a null or empty name, because those are
                // registered under SpaceLabel.UNKNOWN.
                SpaceLabel setLabel;

                if (docSet.name.isNullOrEmpty() || docSet.name == SpaceLabel.UNKNOWN)
                {
                    outputContext.space.label_unknown = new SpaceLabel(SpaceLabel.UNKNOWN);
                    setLabel = outputContext.space.label_unknown;
                    labels.Add(SpaceLabel.UNKNOWN, setLabel);
                }
                else
                {
                    setLabel = new SpaceLabel(docSet.name);
                    labels.Add(setLabel.name, setLabel);
                    outputContext.space.labels.Add(setLabel);
                }

                // NOTE(review): looks loop-invariant; could be hoisted if GetDataSetSignature has no side effects - confirm
                String datasetSignature = context.dataset.GetDataSetSignature();

                // ---- render
                List <TextDocumentSet> textSetForLabel = new List <TextDocumentSet>();

                if (CacheProvider.IsReady)
                {
                    foreach (WebSiteDocuments site in docSet)
                    {
                        TextDocumentSet tds = CacheProvider.GetCached <TextDocumentSet>(setupSignature, datasetSignature, site.domain);

                        if (tds == null)
                        {
                            // cache miss: render the site and store the result
                            tds = render.RenderSiteDocuments(site, logger);
                            CacheProvider.SetCached(setupSignature, datasetSignature, tds.name, tds);
                        }
                        else
                        {
                            tds.name = site.domain;
                        }

                        textSetForLabel.Add(tds);
                    }
                }
                else
                {
                    textSetForLabel = render.RenderDocumentSet(docSet, logger);

                    // NOTE(review): SetCached is called although the provider reported it is not ready - confirm this is intended
                    foreach (TextDocumentSet ws in textSetForLabel)
                    {
                        CacheProvider.SetCached(setupSignature, datasetSignature, ws.name, ws);
                    }
                }

                textSetForLabel.ForEach(x => inputTextRenders.Add(x.name, x));

                // --- rest of indexing
                docSet.ForEach(x => inputSites.Add(x.domain, x));
                renderIndex.Add(docSet, textSetForLabel);

                foreach (WebSiteDocuments site in docSet)
                {
                    inputSiteVsLabels.Add(site, new List <SpaceLabel> { setLabel });
                    c++;
                }
            }

            if (notes != null)
            {
                notes.log("Text document for [" + c + "] entities created");
            }

            // tmp index: site -> rendered text set; both dictionaries are keyed by the same string
            foreach (String key in inputSites.Keys)
            {
                sitesToRenders.Add(inputSites[key], inputTextRenders[key]);
            }

            // page in site filtering
            if (filter.IsEnabled)
            {
                Dictionary <WebSiteDocuments, TextDocumentSet> renderIndexFiltered = new Dictionary <WebSiteDocuments, TextDocumentSet>();

                filter.Learn(inputTextRenders.Values);

                foreach (KeyValuePair <WebSiteDocuments, TextDocumentSet> pair in sitesToRenders)
                {
                    renderIndexFiltered.Add(pair.Key, filter.FilterDocumentSet(pair.Value));
                }
                sitesToRenders = renderIndexFiltered;
            }

            Dictionary <String, TextDocumentSet> TextDocumentsByDomainName = new Dictionary <string, TextDocumentSet>();

            foreach (var pair in sitesToRenders)
            {
                TextDocumentsByDomainName.Add(pair.Key.domain, pair.Value);
            }

            // blending the (possibly filtered) text sets into the corpus documents
            Boolean keepSeparated = blender.DoKeepPagesSeparated;

            foreach (var pair in renderIndex)
            {
                foreach (TextDocumentSet entitySet in pair.Value)
                {
                    TextDocumentSet      selectedTexts = TextDocumentsByDomainName[entitySet.name];
                    WebSiteDocuments     web           = inputSites[entitySet.name];
                    IEnumerable <string> label         = inputSiteVsLabels[web].Select(x => x.name);

                    // NOTE(review): the single-document blend runs when DoKeepPagesSeparated is true and
                    // the separate-documents blend when it is false - this reads inverted; confirm
                    // against the blender's definition before changing it.
                    if (keepSeparated)
                    {
                        TextDocument doc = blender.blendToTextDocument(selectedTexts);
                        doc.labels.AddRange(label);
                        outputContext.corpus_documents.Add(doc);
                    }
                    else
                    {
                        var docs = blender.blendToSeparateTextDocuments(selectedTexts);
                        foreach (TextDocument doc in docs)
                        {
                            doc.labels.AddRange(label);
                            outputContext.corpus_documents.Add(doc);
                        }
                    }
                }
            }

            if (notes != null)
            {
                notes.logEndPhase();
            }

            return(outputContext);
        }