示例#1
0
        /// <summary>
        /// Builds document vectors (and, optionally, category vectors) from the selected features and the feature weighting model.
        /// </summary>
        /// <param name="context">Operation context: its <c>SelectedFeatures</c> and <c>spaceModel</c> are read, and the constructed vectors are added to its <c>vectorSpace</c>.</param>
        /// <param name="log">Log builder that receives the progress messages.</param>
        /// <param name="constructCategories">If set to <c>true</c>, a <see cref="VectorLabel"/> is also built for every category of the space model.</param>
        public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
        {
            // Feature vector: keys of the features that survived feature selection.
            List <string> FV = context.SelectedFeatures.GetKeys();

            log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count + "]");

            // preparing the model
            weightModel.PrepareTheModel(context.spaceModel, log);

            Int32 documentCount = context.spaceModel.documents.Count;
            Int32 i             = 0;

            // Progress is reported roughly ten times per run. The interval is clamped to 1 because
            // integer division yields 0 for fewer than ten documents, and "i % 0" throws
            // DivideByZeroException.
            Int32 s = Math.Max(1, documentCount / 10);

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.spaceModel.documents)
            {
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = weightModel.GetWeights(FV, docModel, context.spaceModel);

                context.vectorSpace.documents.Add(docVec);

                if (i % s == 0)
                {
                    Double r = i.GetRatio(documentCount);
                    log.log("[" + r.ToString("F2") + "]");
                }
                i++;
            }

            if (!constructCategories)
            {
                return;
            }

            // building category VSM
            foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
            {
                VectorLabel catVec = new VectorLabel(catModel.name);
                catVec.terms = weightModel.GetWeights(FV, catModel, context.spaceModel);

                context.vectorSpace.labels.Add(catVec);
            }
        }
        /// <summary>
        /// Builds vectors from selected features and feature weighting model, blending the space model documents
        /// into primary vectors and, optionally, merging those vectors into one vector per category.
        /// </summary>
        /// <param name="context">Operation context: its <c>SelectedFeatures</c> and <c>spaceModel</c> are read, and its <c>vectorSpace</c> is replaced with a newly built instance.</param>
        /// <param name="log">Log builder that receives the progress messages.</param>
        /// <param name="constructCategories">If set to <c>true</c>, a <see cref="VectorLabel"/> is built for every category by merging the terms of the document vectors whose first label matches the category name.</param>
        public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
        {
            // Feature vector: keys of the features that survived feature selection.
            List <string> FV = context.SelectedFeatures.GetKeys();

            log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count + "] ");

            // preparing the model
            weightModel.PrepareTheModel(context.spaceModel, log);

            // blanking anything existing in vector space
            context.vectorSpace = new VectorSpace();

            List <SpaceDocumentModel> toBlendIntoVectors = DocumentBlenderFunctionExtension.GetDocumentToBlend(blender.options, context.spaceModel.documents, log);

            Int32 total = toBlendIntoVectors.Count;
            Int32 i     = 0;

            // Progress is reported roughly five times per run. The interval is clamped to 1 because
            // integer division yields 0 for fewer than five documents, and "i % 0" throws
            // DivideByZeroException.
            Int32 s = Math.Max(1, total / 5);

            // One bucket of document vectors per known category name.
            Dictionary <String, List <VectorDocument> > labelToDocumentSets = new Dictionary <String, List <VectorDocument> >();

            foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
            {
                labelToDocumentSets.Add(catModel.name, new List <VectorDocument>());
            }

            Int32 unlabeled = 0;

            foreach (SpaceDocumentModel model in toBlendIntoVectors)
            {
                VectorDocument docVec = model.BlendToVector <VectorDocument>(weightModel, context.spaceModel, FV);
                context.vectorSpace.documents.Add(docVec);

                if (constructCategories)
                {
                    // Only the first label of the document decides which category it contributes to.
                    String l = model.labels.FirstOrDefault();

                    if (!l.isNullOrEmpty())
                    {
                        List <VectorDocument> bucket;
                        if (labelToDocumentSets.TryGetValue(l, out bucket))
                        {
                            bucket.Add(docVec);
                        }
                        else
                        {
                            // NOTE(review): only documents whose first label is non-empty but matches no
                            // category are counted here; documents without any label are not counted.
                            // Confirm this is the intended meaning of "unlabeled".
                            unlabeled++;
                        }
                    }
                }

                if (i % s == 0)
                {
                    // The ratio is taken against the blended set, i.e. the same total that is
                    // printed as "i/total" in this message.
                    Double r = i.GetRatio(total);
                    log.log("Blending primary vectors [" + r.ToString("P2") + "] : [" + i + "/" + total + "]");
                }
                i++;
            }

            if (constructCategories)
            {
                if (unlabeled > 0)
                {
                    log.log("Vectors [" + unlabeled + "] are unlabeled");
                }

                log.log(":: Creating VectorSpace instances for categories");

                // building category VSM
                foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
                {
                    VectorLabel catVec = new VectorLabel(catModel.name);
                    foreach (var docVec in labelToDocumentSets[catModel.name])
                    {
                        catVec.terms.Merge(docVec.terms);
                    }

                    context.vectorSpace.labels.Add(catVec);
                }
            }

            // The weight model is released once all vectors are built.
            if (weightModel != null)
            {
                weightModel.Dispose();
            }
        }
示例#3
0
        /// <summary>
        /// Executes the plane method, invoking contained functions according to the settings
        /// </summary>
        /// <remarks>
        /// Steps, in order: builds a <see cref="SpaceDocumentModel"/> for every corpus document, writes the optional reports,
        /// selects the features, creates a category model for every label other than <c>SpaceLabel.UNKNOWN</c>,
        /// and finally builds the document and category vectors with the weight model.
        /// </remarks>
        /// <param name="inputContext">The input context - related to this plane. It is cast to <see cref="ICorpusPlaneContext"/>.</param>
        /// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
        /// <param name="logger">The logger.</param>
        /// <returns>
        /// A new <see cref="VectorPlaneContext"/> holding the constructed vector space and the label-to-document links of the corpus space.
        /// </returns>
        public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
        {
            notes.logStartPhase("[2] Corpus Plane - execution", "");

            ICorpusPlaneContext context       = (ICorpusPlaneContext)inputContext;
            VectorPlaneContext  outputContext = new VectorPlaneContext();

            outputContext.provider.StoreAndReceive(context);

            context.stemmContext = new StemmingContext(stemmer);

            // Document name -> space model, used by the per-dataset statistics report below.
            Dictionary <String, SpaceDocumentModel> documentVsModel = new Dictionary <string, SpaceDocumentModel>();

            // modelling the documents
            foreach (TextDocument doc in context.corpus_documents)
            {
                SpaceDocumentModel model          = spaceConstructor.ConstructDocument(doc.content, doc.name, context.space, context.stemmContext, tokenizer);
                List <SpaceLabel>  labels         = spaceConstructor.GetLabels(doc.labels, context.space);
                Boolean            isUnknownLabel = true;
                foreach (SpaceLabel label in labels)
                {
                    if (label.name != SpaceLabel.UNKNOWN)
                    {
                        isUnknownLabel = false;
                    }
                    context.space.LabelToDocumentLinks.Add(label, model, 1);
                }
                context.space.documents.Add(model);

                // Terms are merged into the corpus dictionary only for documents that carry
                // at least one label other than UNKNOWN.
                if (!isUnknownLabel)
                {
                    context.space.terms.MergeDictionary(model.terms);
                }

                documentVsModel.Add(doc.name, model);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_textrender))
            {
                foreach (TextDocument doc in context.corpus_documents)
                {
                    // File name prefix: first label of the document, or UNKNOWN when it has none.
                    String prefix = doc.labels.FirstOrDefault();
                    if (prefix.isNullOrEmpty())
                    {
                        prefix = SpaceLabel.UNKNOWN;
                    }

                    String fn  = prefix + "_" + doc.name;
                    String pth = notes.folder_entity.pathFor(fn.getFilename("txt"), imbSCI.Data.enums.getWritableFileMode.overwrite, "Textual representation of website [" + doc.name + "], produced by rendering and blending settings", true);
                    doc.content.saveStringToFile(pth, imbSCI.Data.enums.getWritableFileMode.overwrite);
                }
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_stats))
            {
                foreach (WebSiteDocumentsSet ds in context.dataset)
                {
                    DataTable dt = ds.MakeTable(documentVsModel);
                    notes.SaveDataTable(dt, notes.folder_entity);
                }

                var dt_vsm = context.space.LabelToDocumentLinks.MakeTable("LabelToDocument", "Relationships between labels and documents in the primary Vector Space Model");
                notes.SaveDataTable(dt_vsm, notes.folder_corpus);
            }

            if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_corpusDictionary))
            {
                notes.SaveDataTable(context.space.terms.MakeTable("corpus_stats", "Training set dictionary, after stemming", generalContext.DictionaryReportLimit), notes.folder_corpus);
            }

            #region SELECTING THE FEATURES
            // forming corpus global weight
            context.SelectedFeatures = new WeightDictionary();
            List <KeyValuePair <string, double> > filter_result = filter.SelectFeatures(context.space);

            // FV is the feature list handed to the weight model when the vectors are built below;
            // it has to list the same features that end up in context.SelectedFeatures.
            List <string> FV = new List <string>();
            FV.AddRange(filter_result.Select(x => x.Key));

            if (filter_result.Any())
            {
                foreach (var pair in filter_result)
                {
                    context.SelectedFeatures.AddEntry(pair.Key, pair.Value);
                }

                if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_selectedFeatures))
                {
                    notes.SaveDataTable(context.SelectedFeatures.MakeTable("selected_features", "Features selected for BoW construction", new List <string>()
                    {
                        filter.function.shortName
                    }, generalContext.DictionaryReportLimit), notes.folder_corpus);
                }
            }
            else
            {
                logger.log("-- Feature selection function returned zero set. All features [" + context.space.terms.Count + "] are therefore accepted as selected.");
                var tkns = context.space.terms.GetTokens();
                foreach (var tkn in tkns)
                {
                    context.SelectedFeatures.AddEntry(tkn, 1);

                    // Keep FV in step with the fallback selection: without this FV stays empty and
                    // the weight model is asked for weights over no features at all.
                    FV.Add(tkn);
                }
            }
            #endregion

            notes.log("Selected features [" + context.SelectedFeatures.entries.Count + "] by [" + filter.functionSettings.functionName + "]");

            outputContext.vectorSpace = new Vectors.VectorSpace();

            // Every label except UNKNOWN gets a category model built from its linked documents.
            foreach (SpaceLabel label in context.space.labels)
            {
                var docs = context.space.LabelToDocumentLinks.GetAllLinked(label);
                if (label.name != SpaceLabel.UNKNOWN)
                {
                    SpaceCategoryModel categoryModel = new SpaceCategoryModel(label, docs);
                    context.space.LabelToCategoryLinks.Add(label, categoryModel, 1);

                    context.space.categories.Add(categoryModel);

                    notes.log("Class [" + categoryModel.name + "] BoW model created - terms[" + categoryModel.terms.Count + "] ");
                }
            }

            outputContext.LabelToDocumentLinks = context.space.LabelToDocumentLinks;

            // preparing the model
            weightModel.PrepareTheModel(context.space);

            // The report flag is read once here instead of on every loop iteration.
            Boolean reportBoWModels = generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels);

            // building document VSM
            foreach (SpaceDocumentModel docModel in context.space.documents)
            {
                var            wd     = weightModel.GetWeights(FV, docModel, context.space);
                VectorDocument docVec = new VectorDocument(docModel.name);
                docVec.terms = wd;

                if (reportBoWModels)
                {
                    DataTable dt = wd.MakeTable("docVec_" + docModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
                outputContext.vectorSpace.documents.Add(docVec);
            }

            // building category VSM
            foreach (SpaceCategoryModel catModel in context.space.categories)
            {
                var         wd     = weightModel.GetWeights(FV, catModel, context.space);
                VectorLabel catVec = new VectorLabel(catModel.name);
                catVec.terms = wd;

                if (reportBoWModels)
                {
                    DataTable dt = wd.MakeTable("catVec_" + catModel.name, "Document vector model", null, 10000);
                    notes.SaveDataTable(dt, notes.folder_vector);
                }

                outputContext.vectorSpace.labels.Add(catVec);
            }

            if (reportBoWModels)
            {
                foreach (SpaceCategoryModel catModel in context.space.categories)
                {
                    var dt = catModel.terms.MakeTable("cat_" + catModel.name, "Vector Space BoW weighted model, representing a category");
                    notes.SaveDataTable(dt, notes.folder_vector);
                }
            }

            notes.logEndPhase();

            return(outputContext);
        }