/// <summary>
        /// Creates a <c>SpaceCategoryModel</c> for every label of the space model whose name is not <c>SpaceLabel.UNKNOWN</c>.
        /// </summary>
        /// <remarks>
        /// Each created category model is registered in <c>LabelToCategoryLinks</c> (together with its label and the value 1)
        /// and appended to the <c>categories</c> collection of the space model.
        /// </remarks>
        /// <param name="context">Operation context whose <c>spaceModel</c> is read and updated.</param>
        /// <param name="log">Log builder that receives the progress message.</param>
        public void SpaceModelCategories(OperationContext context, ILogBuilder log)
        {
            log.log("Space model categories");

            foreach (SpaceLabel label in context.spaceModel.labels)
            {
                // The UNKNOWN label never gets a category model.
                if (label.name == SpaceLabel.UNKNOWN)
                {
                    continue;
                }

                var linkedDocuments = context.spaceModel.LabelToDocumentLinks.GetAllLinked(label);
                SpaceCategoryModel category = new SpaceCategoryModel(label, linkedDocuments);

                context.spaceModel.LabelToCategoryLinks.Add(label, category, 1);
                context.spaceModel.categories.Add(category);
            }
        }
Exemple #2
0
        /// <summary>
        /// Executes the plane method, invoking contained functions according to the settings
        /// </summary>
        /// <remarks>
        /// Steps, in order: builds a space document model for every corpus document, optionally writes the
        /// reports enabled in <c>generalContext.reportOptions</c>, selects features with <c>filter</c>,
        /// creates category models for all labels except <c>SpaceLabel.UNKNOWN</c>, prepares <c>weightModel</c>
        /// and finally builds document and category vectors into the output context.
        /// </remarks>
        /// <param name="inputContext">The input context - related to this plane. It is cast to <c>ICorpusPlaneContext</c>.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// The newly created <c>VectorPlaneContext</c>, holding the vector space built from the corpus
        /// and the label-to-document links of the input space.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[2] Corpus Plane - execution", "");

            ICorpusPlaneContext context       = (ICorpusPlaneContext)inputContext;
            VectorPlaneContext  outputContext = new VectorPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            context.stemmContext = new StemmingContext(stemmer);

            // document name -> constructed model; consumed by the per-dataset statistics report below
            Dictionary <String, SpaceDocumentModel> documentVsModel = new Dictionary <string, SpaceDocumentModel>();

            // modelling the documents
            foreach (TextDocument doc in context.corpus_documents)
            {
                SpaceDocumentModel model          = spaceConstructor.ConstructDocument(doc.content, doc.name, context.space, context.stemmContext, tokenizer);
                List <SpaceLabel>  labels         = spaceConstructor.GetLabels(doc.labels, context.space);
                Boolean            isUnknownLabel = true;
                foreach (SpaceLabel label in labels)
                {
                    if (label.name != SpaceLabel.UNKNOWN)
                    {
                        isUnknownLabel = false;
                    }
                    context.space.LabelToDocumentLinks.Add(label, model, 1);
                }
                context.space.documents.Add(model);

                // only documents having at least one label other than UNKNOWN contribute to the corpus dictionary
                if (!isUnknownLabel)
                {
                    context.space.terms.MergeDictionary(model.terms);
                }

                documentVsModel.Add(doc.name, model);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_textrender))
            {
                foreach (TextDocument doc in context.corpus_documents)
                {
                    // file name is prefixed with the first label of the document, or UNKNOWN when there is none
                    String prefix = doc.labels.FirstOrDefault();
                    if (prefix.isNullOrEmpty())
                    {
                        prefix = SpaceLabel.UNKNOWN;
                    }

                    String fn  = prefix + "_" + doc.name;
                    String pth = notes.folder_entity.pathFor(fn.getFilename("txt"), imbSCI.Data.enums.getWritableFileMode.overwrite, "Textual representation of website [" + doc.name + "], produced by rendering and blending settings", true);
                    doc.content.saveStringToFile(pth, imbSCI.Data.enums.getWritableFileMode.overwrite);
                }
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_stats))
            {
                foreach (WebSiteDocumentsSet ds in context.dataset)
                {
                    DataTable dt = ds.MakeTable(documentVsModel);
                    notes.SaveDataTable(dt, notes.folder_entity);
                }

                var dt_vsm = context.space.LabelToDocumentLinks.MakeTable("LabelToDocument", "Relationships between labels and documents in the primary Vector Space Model");
                notes.SaveDataTable(dt_vsm, notes.folder_corpus);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_corpusDictionary))
            {
                notes.SaveDataTable(context.space.terms.MakeTable("corpus_stats", "Training set dictionary, after stemming", generalContext.DictionaryReportLimit), notes.folder_corpus);
            }

            #region SELECTING THE FEATURES
            // forming corpus global weight
            context.SelectedFeatures = new WeightDictionary();
            List <KeyValuePair <string, double> > filter_result = filter.SelectFeatures(context.space);

            // FV is the feature list handed to weightModel.GetWeights for every document and category
            List <string> FV = new List <string>();
            FV.AddRange(filter_result.Select(x => x.Key));

            if (filter_result.Any())
            {
                foreach (var pair in filter_result)
                {
                    context.SelectedFeatures.AddEntry(pair.Key, pair.Value);
                }

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_selectedFeatures))
                {
                    notes.SaveDataTable(context.SelectedFeatures.MakeTable("selected_features", "Features selected for BoW construction", new List <string>()
                    {
                        filter.function.shortName
                    }, generalContext.DictionaryReportLimit), notes.folder_corpus);
                }
            }
            else
            {
                logger.log("-- Feature selection function returned zero set. All features [" + context.space.terms.Count + "] are therefore accepted as selected.");
                var tkns = context.space.terms.GetTokens();
                foreach (var tkn in tkns)
                {
                    context.SelectedFeatures.AddEntry(tkn, 1);

                    // keep FV in sync with SelectedFeatures: without this the fallback would leave FV empty,
                    // so the vectors below would be built for no features although all were reported as accepted
                    FV.Add(tkn);
                }
            }
            #endregion

            notes.log("Selected features [" + context.SelectedFeatures.entries.Count + "] by [" + filter.functionSettings.functionName + "]");

            outputContext.vectorSpace = new Vectors.VectorSpace();

            // creating a category model for each label, except for the UNKNOWN label
            foreach (SpaceLabel label in context.space.labels)
            {
                var docs = context.space.LabelToDocumentLinks.GetAllLinked(label);
                if (label.name != SpaceLabel.UNKNOWN)
                {
                    SpaceCategoryModel categoryModel = new SpaceCategoryModel(label, docs);
                    context.space.LabelToCategoryLinks.Add(label, categoryModel, 1);

                    context.space.categories.Add(categoryModel);

                    notes.log("Class [" + categoryModel.name + "] BoW model created - terms[" + categoryModel.terms.Count + "] ");
                }
            }

            outputContext.LabelToDocumentLinks = context.space.LabelToDocumentLinks;

            // preparing the model
            weightModel.PrepareTheModel(context.space);

            // the report flag does not change inside the loops below, so it is evaluated once
            Boolean reportBoWModels = generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels);

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.space.documents)
            {
                var            wd     = weightModel.GetWeights(FV, docModel, context.space);
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = wd;

                if (reportBoWModels)
                {
                    DataTable dt = wd.MakeTable("docVec_" + docModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
                outputContext.vectorSpace.documents.Add(docVec);
            }

            // building category VSM
            foreach (SpaceCategoryModel catModel in context.space.categories)
            {
                var         wd     = weightModel.GetWeights(FV, catModel, context.space);
                VectorLabel catVec = new VectorLabel(catModel.name);
                catVec.terms = wd;

                if (reportBoWModels)
                {
                    // NOTE(review): the table description reuses the document wording for a category vector - confirm whether intended
                    DataTable dt = wd.MakeTable("catVec_" + catModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }

                outputContext.vectorSpace.labels.Add(catVec);
            }

            if (reportBoWModels)
            {
                foreach (SpaceCategoryModel catModel in context.space.categories)
                {
                    var dt = catModel.terms.MakeTable("cat_" + catModel.name, "Vector Space BoW weighted model, representing a category");
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
            }

            notes.logEndPhase();

            return(outputContext);
        }