/// <summary>
/// Builds vectors from selected features and feature weighting model
/// </summary>
/// <param name="context">The context; its <c>SelectedFeatures</c> and <c>spaceModel</c> are read and its <c>vectorSpace</c> receives the constructed vectors.</param>
/// <param name="log">The log that receives the preparation message and the progress reports.</param>
/// <param name="constructCategories">If set to <c>true</c>, a <c>VectorLabel</c> is also built for each category of <c>context.spaceModel</c>.</param>
public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
{
    List<string> FV = context.SelectedFeatures.GetKeys();

    log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count + "]");

    // preparing the model
    weightModel.PrepareTheModel(context.spaceModel, log);

    Int32 total = context.spaceModel.documents.Count;
    Int32 i = 0;

    // Progress is reported about every 10% of the documents. The step is clamped to 1 because
    // the integer division yields 0 for fewer than 10 documents, and "i % 0" would throw
    // a DivideByZeroException.
    Int32 s = Math.Max(1, total / 10);

    // building document VSM
    foreach (SpaceDocumentModel docModel in context.spaceModel.documents)
    {
        var wd = weightModel.GetWeights(FV, docModel, context.spaceModel);

        VectorDocument docVec = new VectorDocument(docModel.name);
        docVec.terms = wd;

        context.vectorSpace.documents.Add(docVec);

        if (i % s == 0)
        {
            Double r = i.GetRatio(total);
            log.log("[" + r.ToString("F2") + "]");
        }

        i++;
    }

    if (constructCategories)
    {
        // building category VSM
        foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
        {
            var wd = weightModel.GetWeights(FV, catModel, context.spaceModel);

            VectorLabel catVec = new VectorLabel(catModel.name);
            catVec.terms = wd;

            context.vectorSpace.labels.Add(catVec);
        }
    }
}
/// <summary>
/// Builds vectors from selected features and feature weighting model
/// </summary>
/// <remarks>
/// Any existing <c>context.vectorSpace</c> is replaced with a new instance. The weight model
/// is disposed at the end of the call.
/// </remarks>
/// <param name="context">The context; its <c>SelectedFeatures</c> and <c>spaceModel</c> are read and its <c>vectorSpace</c> is rebuilt.</param>
/// <param name="log">The log that receives the preparation message and the progress reports.</param>
/// <param name="constructCategories">If set to <c>true</c>, a <c>VectorLabel</c> is built for each category by merging the terms of the document vectors whose first label equals the category name.</param>
public void VectorSpaceConstruction(OperationContext context, ILogBuilder log, Boolean constructCategories = false)
{
    List<string> FV = context.SelectedFeatures.GetKeys();

    log.log("Preparing Weight model [" + weightModel.GetSignature() + "] - feature selection [" + FV.Count + "] ");

    // preparing the model
    weightModel.PrepareTheModel(context.spaceModel, log);

    // blanking anything existing in vector space
    context.vectorSpace = new VectorSpace();

    List<SpaceDocumentModel> toBlendIntoVectors = DocumentBlenderFunctionExtension.GetDocumentToBlend(blender.options, context.spaceModel.documents, log);

    Int32 total = toBlendIntoVectors.Count;
    Int32 i = 0;

    // Progress is reported about every 20% of the documents. The step is clamped to 1 because
    // the integer division yields 0 for fewer than 5 documents, and "i % 0" would throw
    // a DivideByZeroException.
    Int32 s = Math.Max(1, total / 5);

    // One bucket of document vectors per known category name
    Dictionary<String, List<VectorDocument>> labelToDocumentSets = new Dictionary<String, List<VectorDocument>>();

    foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
    {
        labelToDocumentSets.Add(catModel.name, new List<VectorDocument>());
    }

    // Counts documents whose first label is non-empty but matches no category
    Int32 unlabeled = 0;

    foreach (SpaceDocumentModel model in toBlendIntoVectors)
    {
        VectorDocument docVec = model.BlendToVector<VectorDocument>(weightModel, context.spaceModel, FV);

        context.vectorSpace.documents.Add(docVec);

        if (constructCategories)
        {
            String l = model.labels.FirstOrDefault();

            if (!l.isNullOrEmpty())
            {
                // Single lookup instead of ContainsKey followed by the indexer
                List<VectorDocument> labeledSet;
                if (labelToDocumentSets.TryGetValue(l, out labeledSet))
                {
                    labeledSet.Add(docVec);
                }
                else
                {
                    unlabeled++;
                }
            }
        }

        if (i % s == 0)
        {
            // The ratio is taken against the collection being iterated, so the percentage
            // agrees with the "i/total" counter printed in the same message.
            Double r = i.GetRatio(total);
            log.log("Blending primary vectors [" + r.ToString("P2") + "] : [" + i + "/" + total + "]");
        }

        i++;
    }

    if (constructCategories && (unlabeled > 0))
    {
        log.log("Vectors [" + unlabeled + "] are unlabeled");
    }

    if (constructCategories)
    {
        log.log(":: Creating VectorSpace instances for categories");

        // building category VSM
        foreach (SpaceCategoryModel catModel in context.spaceModel.categories)
        {
            VectorLabel catVec = new VectorLabel(catModel.name);

            foreach (var docVec in labelToDocumentSets[catModel.name])
            {
                catVec.terms.Merge(docVec.terms);
            }

            context.vectorSpace.labels.Add(catVec);
        }
    }

    if (weightModel != null)
    {
        weightModel.Dispose();
    }
}
/// <summary>
/// Executes the plane method, invoking contained functions according to the settings
/// </summary>
/// <param name="inputContext">The input context - related to this plane.</param>
/// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
/// <param name="logger">The logger.</param>
/// <returns>
/// A <c>VectorPlaneContext</c> carrying the constructed vector space (document and category vectors) and the label-to-document links
/// </returns>
public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
{
    notes.logStartPhase("[2] Corpus Plane - execution", "");

    ICorpusPlaneContext context = (ICorpusPlaneContext)inputContext;
    VectorPlaneContext outputContext = new VectorPlaneContext();
    outputContext.provider.StoreAndReceive(context);

    context.stemmContext = new StemmingContext(stemmer);

    // Document name -> space model, used later by the per-dataset statistics report
    Dictionary<String, SpaceDocumentModel> documentVsModel = new Dictionary<string, SpaceDocumentModel>();

    // modelling the documents
    foreach (TextDocument doc in context.corpus_documents)
    {
        SpaceDocumentModel model = spaceConstructor.ConstructDocument(doc.content, doc.name, context.space, context.stemmContext, tokenizer);

        List<SpaceLabel> labels = spaceConstructor.GetLabels(doc.labels, context.space);

        Boolean isUnknownLabel = true;
        foreach (SpaceLabel label in labels)
        {
            if (label.name != SpaceLabel.UNKNOWN)
            {
                isUnknownLabel = false;
            }
            context.space.LabelToDocumentLinks.Add(label, model, 1);
        }

        context.space.documents.Add(model);

        // Only documents with at least one known label contribute to the corpus dictionary
        if (!isUnknownLabel)
        {
            context.space.terms.MergeDictionary(model.terms);
        }

        documentVsModel.Add(doc.name, model);
    }

    if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_textrender))
    {
        foreach (TextDocument doc in context.corpus_documents)
        {
            String prefix = doc.labels.FirstOrDefault();
            if (prefix.isNullOrEmpty())
            {
                prefix = SpaceLabel.UNKNOWN;
            }

            String fn = prefix + "_" + doc.name;
            String pth = notes.folder_entity.pathFor(fn.getFilename("txt"), imbSCI.Data.enums.getWritableFileMode.overwrite, "Textual representation of website [" + doc.name + "], produced by rendering and blending settings", true);
            doc.content.saveStringToFile(pth, imbSCI.Data.enums.getWritableFileMode.overwrite);
        }
    }

    if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_fold_stats))
    {
        foreach (WebSiteDocumentsSet ds in context.dataset)
        {
            DataTable dt = ds.MakeTable(documentVsModel);
            notes.SaveDataTable(dt, notes.folder_entity);
        }

        var dt_vsm = context.space.LabelToDocumentLinks.MakeTable("LabelToDocument", "Relationships between labels and documents in the primary Vector Space Model");
        notes.SaveDataTable(dt_vsm, notes.folder_corpus);
    }

    if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_corpusDictionary))
    {
        notes.SaveDataTable(context.space.terms.MakeTable("corpus_stats", "Training set dictionary, after stemming", generalContext.DictionaryReportLimit), notes.folder_corpus);
    }

    #region SELECTING THE FEATURES
    // forming corpus global weight
    context.SelectedFeatures = new WeightDictionary();
    List<KeyValuePair<string, double>> filter_result = filter.SelectFeatures(context.space);
    List<string> FV = new List<string>();
    FV.AddRange(filter_result.Select(x => x.Key));

    if (filter_result.Any())
    {
        foreach (var pair in filter_result)
        {
            context.SelectedFeatures.AddEntry(pair.Key, pair.Value);
        }

        if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_selectedFeatures))
        {
            notes.SaveDataTable(context.SelectedFeatures.MakeTable("selected_features", "Features selected for BoW construction", new List<string>() { filter.function.shortName }, generalContext.DictionaryReportLimit), notes.folder_corpus);
        }
    }
    else
    {
        logger.log("-- Feature selection function returned zero set. All features [" + context.space.terms.Count + "] are therefore accepted as selected.");
        var tkns = context.space.terms.GetTokens();
        foreach (var tkn in tkns)
        {
            context.SelectedFeatures.AddEntry(tkn, 1);
            // Keep the feature vector in step with SelectedFeatures; otherwise FV stays empty
            // and the weights below are computed against no features at all.
            FV.Add(tkn);
        }
    }
    #endregion

    notes.log("Selected features [" + context.SelectedFeatures.entries.Count + "] by [" + filter.functionSettings.functionName + "]");

    outputContext.vectorSpace = new Vectors.VectorSpace();

    // One category model per known (non-UNKNOWN) label, built from the documents linked to it
    foreach (SpaceLabel label in context.space.labels)
    {
        var docs = context.space.LabelToDocumentLinks.GetAllLinked(label);
        if (label.name != SpaceLabel.UNKNOWN)
        {
            SpaceCategoryModel categoryModel = new SpaceCategoryModel(label, docs);
            context.space.LabelToCategoryLinks.Add(label, categoryModel, 1);
            context.space.categories.Add(categoryModel);

            notes.log("Class [" + categoryModel.name + "] BoW model created - terms[" + categoryModel.terms.Count + "] ");
        }
    }

    outputContext.LabelToDocumentLinks = context.space.LabelToDocumentLinks;

    // preparing the model
    weightModel.PrepareTheModel(context.space);

    // building document VSM
    foreach (SpaceDocumentModel docModel in context.space.documents)
    {
        var wd = weightModel.GetWeights(FV, docModel, context.space);

        VectorDocument docVec = new VectorDocument(docModel.name);
        docVec.terms = wd;

        if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
        {
            DataTable dt = wd.MakeTable("docVec_" + docModel.name, "Document vector model", null, 10000);
            notes.SaveDataTable(dt, notes.folder_vector);
        }

        outputContext.vectorSpace.documents.Add(docVec);
    }

    // building category VSM
    foreach (SpaceCategoryModel catModel in context.space.categories)
    {
        var wd = weightModel.GetWeights(FV, catModel, context.space);

        VectorLabel catVec = new VectorLabel(catModel.name);
        catVec.terms = wd;

        if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
        {
            DataTable dt = wd.MakeTable("catVec_" + catModel.name, "Document vector model", null, 10000);
            notes.SaveDataTable(dt, notes.folder_vector);
        }

        outputContext.vectorSpace.labels.Add(catVec);
    }

    if (generalContext.reportOptions.HasFlag(PlanesReportOptions.report_documentBoWModels))
    {
        foreach (SpaceCategoryModel catModel in context.space.categories)
        {
            var dt = catModel.terms.MakeTable("cat_" + catModel.name, "Vector Space BoW weighted model, representing a category");
            notes.SaveDataTable(dt, notes.folder_vector);
        }
    }

    notes.logEndPhase();

    return (outputContext);
}