/// <summary>
/// Builds a labeled dataset from every document vector in the feature space and deploys it.
/// Vectors without a label association are mapped to the <see cref="SpaceLabel.UNKNOWN"/> label index.
/// </summary>
/// <param name="space">Feature space providing document vectors and label associations.</param>
/// <param name="labels">Ordered label names; a vector's label ID is its index in this list.</param>
/// <param name="logger">Log builder.</param>
public void Deploy(FeatureSpace space, List<String> labels, ILogBuilder logger)
{
    var labeledVectors = new List<FeatureVectorWithLabelID>();

    foreach (FeatureVector document in space.documents)
    {
        var links = space.labelToDocumentAssociations.GetAllLinked(document);

        // Resolve the label index: first associated label name, or UNKNOWN when nothing is linked.
        Int32 labelIndex = links.Any()
            ? labels.IndexOf(links.First().name)
            : labels.IndexOf(SpaceLabel.UNKNOWN);

        labeledVectors.Add(new FeatureVectorWithLabelID(document, labelIndex));
    }

    Deploy(labeledVectors, logger, labels);
}
/// <summary>
/// Computes the selection score for the given entry from its feature vector,
/// according to the active <see cref="ScoreComputationModeEnum"/> flags.
/// </summary>
/// <param name="entry">Entry being scored; its vector is resolved via DomainID and AssignedID.</param>
/// <param name="context">Selection context (not used beyond the entry lookup here).</param>
/// <param name="log">Log builder for diagnostics.</param>
/// <returns>The computed score; 0 when the vector is missing or the computation yields NaN.</returns>
public override double Score(DocumentSelectResultEntry entry, DocumentSelectResult context, ILogBuilder log)
{
    FeatureVectorWithLabelID fv = vectorDictionary.Get(entry.DomainID, entry.AssignedID);

    if (fv == null)
    {
        log.log("Can't find vector dictionary entry for [" + entry.DomainID + "]>[" + entry.AssignedID + "]");
        return (0);
    }

    Double sc = 0;

    if (computation.HasFlag(ScoreComputationModeEnum.offset))
    {
        sc = fv.CompressByTrueDimension(fv.labelID);
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.variance))
    {
        sc = fv.dimensions.GetVarianceCoefficient();
    }
    else if (computation.HasFlag(ScoreComputationModeEnum.distance))
    {
        sc = fv.CompressNumericVector(imbSCI.Core.enums.operation.max);
    }
    else
    {
        // Default mode: use the first dimension value directly.
        sc = fv.dimensions[0];
    }

    if (computation.HasFlag(ScoreComputationModeEnum.inverse))
    {
        sc = -sc;
    }

    // FIX: the original used 'sc == Double.NaN', which is always false under IEEE 754
    // (NaN never compares equal to anything, including itself), so NaN scores leaked
    // through. Double.IsNaN is the correct check for normalizing NaN to zero.
    if (Double.IsNaN(sc))
    {
        sc = 0;
    }

    return (sc);
}
/// <summary>
/// Runs one classification cycle: deploys an in-fold truth table when none is supplied,
/// splits the feature space into training and test sets, trains the classifier and
/// collects per-vector predictions into <c>context.testResults</c>.
/// </summary>
/// <param name="context">Operation context carrying the feature space and result sets.</param>
/// <param name="truthTable">Truth table; when null, one is deployed in-fold.</param>
/// <param name="distributionRule">Rule used to split training and test sets.</param>
/// <param name="log">Log builder.</param>
public void PerformClassification(OperationContext context, ExperimentTruthTable truthTable, ClassificationDatasetSeparationEnum distributionRule, ILogBuilder log)
{
    log.log("Performing classification");

    // No truth table supplied -> build one in-fold from the current feature space.
    if (truthTable == null)
    {
        truthTable = new ExperimentTruthTable();
        notes.log(":: DEPLOYING IN-FOLD TRUTH TABLE ::");
        truthTable.Deploy(context.featureSpace, context.spaceModel.labels.Select(x => x.name).ToList(), log);
    }

    DistributeTrainingAndTestSets(distributionRule, truthTable, context.featureSpace, log, context.testSet, context.trainingSet);

    if (!context.trainingSet.Any())
    {
        notes.log("TRAINING SET EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");
        return;
    }

    notes.log("Training [" + classifier.name + "] with [" + context.trainingSet.Count + "] feature vectors.");
    classifier.DoTraining(context.trainingSet, log);
    log.log("Training [" + classifier.name + "] completed.");

    notes.log("Testing [" + classifier.name + "] with [" + context.testSet.Count + "] feature vectors.");
    context.testResults = new List<FeatureVectorWithLabelID>();

    // Classify each bare test vector and pair it with the predicted label index.
    foreach (FeatureVector unlabeled in context.testSet.Select(x => x.vector))
    {
        Int32 predicted = classifier.DoSelect(unlabeled, log);
        context.testResults.Add(new FeatureVectorWithLabelID(unlabeled, predicted));
    }

    log.log("Testing [" + classifier.name + "] completed.");
}
/// <summary>
/// Executes the feature-plane method: partitions the feature space into labeled
/// (training) and unlabeled (test) vectors, trains the classifier on the labeled
/// set and runs it over the test set.
/// </summary>
/// <param name="inputContext">The input context - related to this plane.</param>
/// <param name="generalContext">General execution context, attached to the <see cref="T:imbNLP.Toolkit.Planes.PlanesMethodDesign" /></param>
/// <param name="logger">The logger.</param>
/// <returns>The same feature-plane context, with training/test sets and test results populated.</returns>
public IPlaneContext ExecutePlaneMethod(IPlaneContext inputContext, ExperimentModelExecutionContext generalContext, ILogBuilder logger)
{
    notes.logStartPhase("[4] Feature Plane - execution", "");

    IFeaturePlaneContext context = inputContext as IFeaturePlaneContext;

    // Vectors with at least one label association become training samples;
    // the rest are treated as unlabeled and go to the test set.
    foreach (FeatureVector vec in context.featureSpace.documents)
    {
        var associated = context.featureSpace.labelToDocumentAssociations.GetAllLinked(vec);

        if (!associated.Any())
        {
            context.testSet.Add(vec);
            continue;
        }

        Int32 lbi = generalContext.truthTable.labels_without_unknown.IndexOf(associated.First().name);
        context.trainingSet.Add(new FeatureVectorWithLabelID(vec, lbi));
    }

    // 1:1 scheme fallback: with no unlabeled vectors, test against the training vectors.
    if (!context.testSet.Any())
    {
        notes.log("TEST SET IS EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");
        context.trainingSet.ForEach(x => context.testSet.Add(x.vector));
    }

    if (!context.trainingSet.Any())
    {
        notes.log("TRAINING SET EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");
    }
    else
    {
        notes.log("Training [" + classifier.name + "] with [" + context.trainingSet.Count + "] feature vectors.");
        classifier.DoTraining(context.trainingSet, logger);

        notes.log("Testing [" + classifier.name + "] with [" + context.testSet.Count + "] feature vectors.");
        context.testResults = new List<FeatureVectorWithLabelID>();

        foreach (FeatureVector fv in context.testSet)
        {
            Int32 result = classifier.DoSelect(fv, logger);
            context.testResults.Add(new FeatureVectorWithLabelID(fv, result));
        }
    }

    notes.logEndPhase();
    return (context);
}
/// <summary>
/// Prepares the specified context: loads the score dictionary and, when machine
/// learning is enabled, deploys the term-weight model, builds the feature-vector
/// constructor from selected terms, and trains the classifier on vectors whose
/// assigned IDs appear in the score dictionary.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="log">The log.</param>
/// <exception cref="ArgumentException">context</exception>
public override void Prepare(DocumentSelectResult context, ILogBuilder log)
{
    //context.folder.GetOrFindFiles("*", dictionaryFile + "*.xml");

    // Load the score dictionary from the context folder; null means the file was not found.
    scoreDictionary = FeatureVectorDictionaryWithDimensions.LoadFile(context.folder, dictionaryFile, log); // WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(dictionaryFile, context.folder), log);

    if (scoreDictionary == null)
    {
        String msg = "Error: Failed to find score dictionary [" + dictionaryFile + "] in " + context.folder.path;
        throw new ArgumentException(msg, nameof(context));
    }

    if (useMachineLearning)
    {
        #region --------------- PREPARING TERM WEIGHT MODEL

        // Paths for the model definition and (optional) pre-computed model data.
        String p_m = FeatureWeightModel.GetModelDefinitionFilename(modelDefinitionFile, context.folder);
        String p_d = FeatureWeightModel.GetModelDataFilename(modelDefinitionFile, context.folder);

        // Lazily load the model definition only when not already set by the caller.
        if (TermWeightModel == null)
        {
            TermWeightModel = FeatureWeightModel.LoadModel(p_m, log);
        }

        TermWeightModel.Deploy(log);

        if (context.spaceModel == null)
        {
            String msg = "Error: TermWeight factor requires SpaceModel declared in the context for operation";
            throw new ArgumentException(msg, nameof(context));
        }

        if (File.Exists(p_d) && useStoredData)
        {
            // Stored model data exists and reuse is allowed: load it instead of recomputing.
            WeightingModelDataSet data = objectSerialization.loadObjectFromXML<WeightingModelDataSet>(p_d, log);
            TermWeightModel.LoadModelDataSet(data, log);

            if (useSelectedFeatures)
            {
                // Load the persisted selected-feature dictionary ("_sf" suffix convention).
                SelectedTerms = WeightDictionary.LoadFile(WeightDictionary.GetDictionaryFilename(modelDefinitionFile + "_sf", context.folder), log);
            }
        }
        else
        {
            // No stored data (or reuse disabled): compute the model from the space model.
            TermWeightModel.PrepareTheModel(context.spaceModel, log);
        }

        // Fall back to the context's selected features when no terms were loaded.
        // NOTE(review): this dereferences SelectedTerms.Count — presumably SelectedTerms
        // is initialized to a non-null default elsewhere in the class; verify, otherwise
        // this throws when useSelectedFeatures is false and no stored data path ran.
        if (SelectedTerms.Count == 0)
        {
            SelectedTerms = context.selectedFeatures;
        }

        List<String> sel_tkns = new List<String>();
        sel_tkns.AddRange(SelectedTerms.index.Values.Select(x => x.name));

        if (!sel_tkns.Any())
        {
            // Last resort: use all tokens with a known label from the space model.
            sel_tkns.AddRange(context.spaceModel.terms_known_label.GetTokens());
        }

        #endregion

        fvConstructor.Deploy(featureMethod.constructor, sel_tkns);

        classifier = featureMethod.classifierSettings.GetClassifier();

        // Map document names to label IDs using the configured criterion.
        sc_id = scoreDictionary.GetVectorsWithLabelID(null, criterion).ToNameVsLabelID();

        // Build the training set from context items that have a label ID in the dictionary.
        List<FeatureVectorWithLabelID> trainingSet = new List<FeatureVectorWithLabelID>();

        foreach (var item in context.items)
        {
            if (sc_id.ContainsKey(item.AssignedID))
            {
                WeightDictionary dc_vec = TermWeightModel.GetWeights(sel_tkns, item.spaceDocument, context.spaceModel);

                var n_vec = fvConstructor.ConstructFeatureVector(dc_vec, item.AssignedID);

                FeatureVectorWithLabelID id_vec = new FeatureVectorWithLabelID(n_vec, sc_id[item.AssignedID]);

                trainingSet.Add(id_vec);
            }
        }

        log.log("Training [" + classifier.name + "] with [" + sc_id.Count + "] feature vectors.");

        classifier.DoTraining(trainingSet, log);
    }
}
/// <summary>
/// Runs one classification cycle and additionally reports diagnostics when the
/// classifier fails to assign every known class at least once: it lists the
/// classes that received no assignment and dumps near-constant vectors.
/// </summary>
/// <param name="context">Operation context carrying the feature space and result sets.</param>
/// <param name="truthTable">Truth table; when null, one is deployed in-fold.</param>
/// <param name="distributionRule">Rule used to split training and test sets.</param>
/// <param name="log">Log builder.</param>
public void PerformClassification(OperationContext context, ExperimentTruthTable truthTable, ClassificationDatasetSeparationEnum distributionRule, ILogBuilder log)
{
    log.log("Performing classification");

    // No truth table supplied -> build one in-fold from the current feature space.
    if (truthTable == null)
    {
        truthTable = new ExperimentTruthTable();
        notes.log(":: DEPLOYING IN-FOLD TRUTH TABLE ::");
        log.log(":: DEPLOYING IN-FOLD TRUTH TABLE ::");
        truthTable.Deploy(context.featureSpace, context.spaceModel.labels.Select(x => x.name).ToList(), log);
    }

    DistributeTrainingAndTestSets(distributionRule, truthTable, context.featureSpace, log, context.testSet, context.trainingSet);

    if (!context.trainingSet.Any())
    {
        notes.log("TRAINING SET EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");
        return;
    }

    notes.log("Training [" + classifier.name + "] with [" + context.trainingSet.Count + "] feature vectors.");
    classifier.DoTraining(context.trainingSet, log);
    log.log("Training [" + classifier.name + "] completed.");

    notes.log("Testing [" + classifier.name + "] with [" + context.testSet.Count + "] feature vectors.");
    context.testResults = new List<FeatureVectorWithLabelID>();

    // Track every distinct label index the classifier actually produced.
    List<Int32> assignedLabels = new List<int>();

    foreach (FeatureVector vector in context.testSet.Select(x => x.vector))
    {
        Int32 predicted = classifier.DoSelect(vector, log);

        if (!assignedLabels.Contains(predicted))
        {
            assignedLabels.Add(predicted);
        }

        context.testResults.Add(new FeatureVectorWithLabelID(vector, predicted));
    }

    if (assignedLabels.Count < truthTable.labels_without_unknown.Count)
    {
        // Some classes were never predicted: report which ones, then dump vectors
        // that are (near-)constant and thus carry little discriminative signal.
        List<String> unassigned = truthTable.labels_without_unknown.ToList();
        foreach (Int32 labelIndex in assignedLabels)
        {
            unassigned.Remove(truthTable.labels_without_unknown[labelIndex]);
        }

        log.log("WARNING --- [" + classifier.name + "] ONLY [" + assignedLabels.Count + "] of [" + truthTable.labels_without_unknown.Count + "] were assigned by the classifier");

        foreach (String l in unassigned)
        {
            log.log("Class [" + l + "] received no assigment");
        }

        foreach (var v in context.testSet)
        {
            var dist = v.GetDistinctValuesAtVector();
            if (dist.Count < 2)
            {
                log.log("Test vector [" + v.name + "] has [" + dist.Count + "] distinct values at [" + v.dimensions.Length + "] dimensions!");
            }
        }

        foreach (var v in context.trainingSet)
        {
            var dist = v.GetDistinctValuesAtVector();
            if (dist.Count < 2)
            {
                log.log("Training vector [" + v.name + "] has [" + dist.Count + "] distinct values at [" + v.dimensions.Length + "] dimensions!");
            }
        }
    }

    log.log("Testing [" + classifier.name + "] completed.");
}
/// <summary>
/// Splits the feature space into training and test sets according to <paramref name="distributionRule"/>.
/// Vectors whose first associated label is found in the truth table are labeled; all others
/// (no association, or a label unknown to the truth table) carry label index -1 and form the
/// unlabeled pool. When the unlabeled pool is empty, the labeled pool is mirrored into it
/// (1:1 scheme) before the distribution flags are applied.
/// </summary>
/// <param name="distributionRule">Flags selecting which pools feed the output test / training lists.</param>
/// <param name="truthTable">Truth table providing the known label names.</param>
/// <param name="featureSpace">Feature space whose document vectors are distributed.</param>
/// <param name="log">Log builder.</param>
/// <param name="testSet">Output list receiving test vectors.</param>
/// <param name="trainingSet">Output list receiving training vectors.</param>
public void DistributeTrainingAndTestSets(ClassificationDatasetSeparationEnum distributionRule, ExperimentTruthTable truthTable, FeatureSpace featureSpace, ILogBuilder log, List<FeatureVectorWithLabelID> testSet, List<FeatureVectorWithLabelID> trainingSet)
{
    log.log("Spliting data [" + distributionRule.ToString() + "]");

    List<FeatureVectorWithLabelID> __testSet = new List<FeatureVectorWithLabelID>();
    List<FeatureVectorWithLabelID> __trainingSet = new List<FeatureVectorWithLabelID>();

    foreach (FeatureVector vec in featureSpace.documents)
    {
        var associated = featureSpace.labelToDocumentAssociations.GetAllLinked(vec);

        // -1 marks an unlabeled vector; IndexOf may also yield -1 for a label
        // that is not in the truth table, which is treated the same way.
        Int32 lbi = -1;
        if (associated.Any())
        {
            lbi = truthTable.labels_without_unknown.IndexOf(associated.First().name);
        }

        FeatureVectorWithLabelID fvl = new FeatureVectorWithLabelID(vec, lbi);

        if (lbi == -1)
        {
            __testSet.Add(fvl);
        }
        else
        {
            __trainingSet.Add(fvl);
        }
    }

    // 1:1 scheme fallback: no unlabeled vectors -> test against the labeled pool.
    if (!__testSet.Any())
    {
        notes.log("TEST SET IS EMPTY ---- APPLYING 1:1 EXPERIMENT SHEME: training and test set are the same");
        __trainingSet.ForEach(x => __testSet.Add(x));
    }

    // FIX: the flag-based distribution below used to sit in the 'else' branch of the
    // empty-test-set check, so in the 1:1 fallback case it never ran and both output
    // lists stayed empty. It now runs unconditionally, letting the fallback-filled
    // pools actually reach the output sets.
    if (distributionRule.HasFlag(ClassificationDatasetSeparationEnum.TestLabeled))
    {
        testSet.AddRange(__trainingSet, true);
    }

    if (distributionRule.HasFlag(ClassificationDatasetSeparationEnum.TestUnlabeled))
    {
        testSet.AddRange(__testSet, true);
    }

    if (distributionRule.HasFlag(ClassificationDatasetSeparationEnum.TrainingLabeled))
    {
        trainingSet.AddRange(__trainingSet, true);
    }

    if (distributionRule.HasFlag(ClassificationDatasetSeparationEnum.TrainingUnlabeled))
    {
        trainingSet.AddRange(__testSet, true);
    }

    log.log("Training [" + trainingSet.Count + "] - Testing [" + testSet.Count + "]");
}
/// <summary>
/// Generates a <see cref="DataTable"/> from feature vectors: one row per vector,
/// one column per dimension, plus a label column when the vectors carry label IDs.
/// </summary>
/// <typeparam name="T">Vector type exposing a name and dimension values.</typeparam>
/// <param name="_vectors">The vectors.</param>
/// <param name="dimensions">The dimensions.</param>
/// <param name="name">The name.</param>
/// <param name="description">The description.</param>
/// <param name="labels">The labels.</param>
/// <returns>The populated data table.</returns>
public static DataTable MakeTable<T>(this IEnumerable<T> _vectors, dimensionSpecificationSet dimensions, String name, String description, List<String> labels = null) where T : IVectorDimensions
{
    DataTable table = new DataTable();
    table.SetTitle(name);
    table.SetDescription(description);

    // Emit the label column only when the first vector carries a label ID.
    Boolean hasLabelID = false;
    T first = _vectors.FirstOrDefault();
    if (first != null && first is FeatureVectorWithLabelID)
    {
        hasLabelID = true;
    }

    List<T> vectors = _vectors.ToList();

    table.SetAdditionalInfoEntry("Documents", vectors.Count, "Total count of document vectors");
    table.SetAdditionalInfoEntry("Dimensions", dimensions.Count, "Number of dimensions");

    DataColumn column_rank = table.Add("Nr", "Order of appereance", "N", typeof(Int32), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(10);
    DataColumn column_token = table.Add("Name", "Name of the document vector", "Name", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(50);

    // One numeric column per dimension, coded D000, D001, ...
    List<DataColumn> dimension_col = new List<DataColumn>();
    foreach (var dim in dimensions)
    {
        String prefix = dimension_col.Count.ToString("D3");
        dimension_col.Add(table.Add(dim.name, dim.description, "D" + prefix, typeof(Double), imbSCI.Core.enums.dataPointImportance.important, "F5"));
    }

    DataColumn column_label = null;
    if (hasLabelID)
    {
        column_label = table.Add("Label", "Affiliation to a category", "Label", typeof(String), imbSCI.Core.enums.dataPointImportance.normal).SetWidth(50);
    }

    Int32 rowNumber = 1;
    foreach (T docVec in vectors)
    {
        var dr = table.NewRow();
        dr[column_rank] = rowNumber;
        dr[column_token] = docVec.name;

        // Copy only as many dimension values as the vector actually has;
        // extra columns stay unset.
        for (Int32 di = 0; di < dimension_col.Count; di++)
        {
            if (di < docVec.dimensions.Length)
            {
                dr[dimension_col[di]] = docVec.dimensions[di];
            }
        }

        if (hasLabelID)
        {
            Int32 lid = 0;
            FeatureVectorWithLabelID labeledVec = docVec as FeatureVectorWithLabelID;
            if (labeledVec != null)
            {
                lid = labeledVec.labelID;
            }

            String lbl_str = "";
            if (labels != null)
            {
                // Out-of-range IDs (e.g. -1 for unknown) fall back to the raw number.
                lbl_str = (lid > -1 && lid < labels.Count) ? labels[lid] : lid.ToString();
            }

            dr[column_label] = lbl_str;
        }

        table.Rows.Add(dr);
        rowNumber++;
    }

    return (table);
}