/// <summary>
/// Loads the classifier provided by the <c>classifierFactory</c>, modelled on the classes and features referenced by <c>vectorFile</c>.
/// The classifier is then used to evaluate the accuracy of the vectors in <c>vectorFile</c>.
/// A report on the classification details is written to <c>sys_output</c>.
/// </summary>
/// <param name="vectorFile">A file containing the feature vectors to classify.</param>
/// <param name="sys_output">The location of the report on the classification details.</param>
/// <param name="classifierFactory">Provides the necessary classifier.</param>
/// <param name="getDetailsFunc">Produces the per-vector details included in the report.</param>
internal static double ReportOnModel(
    FeatureVectorFile vectorFile
    , string sys_output
    , Func<TextIdMapper, TextIdMapper, Classifier> classifierFactory
    , Func<Classifier, List<FeatureVector>, TextIdMapper, TextIdMapper, string[]> getDetailsFunc)
{
    int gold_i = 0;
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

    Classifier classifier = classifierFactory(classToClassId, featureToFeatureId);

    var vectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses = vectorFile.Headers[gold_i];
    var systemClasses = classifier.Classify(vectors);

    string[] details = getDetailsFunc(classifier, vectors, classToClassId, featureToFeatureId);
    var accuracy = ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.CreateNew, vectors, classToClassId, goldClasses, systemClasses, details, heading: Path.GetFileName(vectorFile.Path));
    return(accuracy);
}
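// For context, a hedged usage sketch of ReportOnModel follows. The file paths, the choice of a MaxEnt
// model as the factory, and the Example_ method name are illustrative assumptions rather than part of
// the original code; the factory and details delegates mirror those used by the other commands here.
internal static double Example_ReportOnMaxEntModel()
{
    var vectorFile = new FeatureVectorFile(path: "test.vectors.txt", noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);
    return ReportOnModel(
        vectorFile
        , "sys_output.txt"
        , classifierFactory: (classToClassId, featureToFeatureId) =>
            MaxEntClassifier.LoadModel("maxent.model.txt", classToClassId, featureToFeatureId)
        , getDetailsFunc: (classifier, vectors, classToClassId, featureToFeatureId) =>
            ProgramOutput.GetDistributionDetails(classifier, vectors, classToClassId));
}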
// Methods
public override double ExecuteCommand()
{
    int instanceName_i = 0;
    int gold_i = 1;
    featureToFeatureId = new TextIdMapper();
    classToClassId = new TextIdMapper();
    var instanceNameToId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { instanceNameToId, classToClassId };

    FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 2, featureDelimiter: ' ', isSortRequired: false);

    // Read the boundaries:
    int[] sentenceLengths = ReadBoundaryFile(boundary_file);

    // Read the classifier model:
    classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

    // Read the vectors:
    var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

    // Get the output ready for display.
    int[] goldClassIds = vectorFile.Headers[gold_i];
    int[] instanceNameIds = vectorFile.Headers[instanceName_i];
    string[] instanceNames = instanceNameToId.GetValues(instanceNameIds);

    // Generate sys_output:
    ConfusionMatrix confusionMatrix;
    File.WriteAllText(sys_output, GenerateSysOutput(instanceNames, testVectors, sentenceLengths, out confusionMatrix, gold_i));
    return(confusionMatrix.CalculateAccuracy());
}
private void Serialize_Recursive(StringBuilder sb, TextIdMapper classToClassId, TextIdMapper wordToWordId, int depth)
{
    // If it is a leaf node, ...
    if (TrueBranch == null && FalseBranch == null)
    {
        string path = GetPath(wordToWordId);
        sb.AppendFormat("{0} {1}", path, FeatureVectors.Count);
        double[] distribution = GetDistributionByClass();
        for (int i = 0; i < distribution.Length; i++)
        {
            sb.AppendFormat(" {0} {1}", classToClassId[i], distribution[i]);
        }
        sb.AppendLine();
    }
    // If it is not a leaf node, ...
    else
    {
        if (FalseBranch != null)
        {
            FalseBranch.Serialize_Recursive(sb, classToClassId, wordToWordId, depth + 1);
        }
        if (TrueBranch != null)
        {
            TrueBranch.Serialize_Recursive(sb, classToClassId, wordToWordId, depth + 1);
        }
    }
}
/// <summary>Loads a TBL classifier from the model_file at the specified location.</summary>
public static TBLClassifier LoadModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, int N, int gold_i)
{
    int defaultClass;
    List<Transformation> transformations = new List<Transformation>();
    using (StreamReader sr = File.OpenText(model_file))
    {
        // Read the default class, which is presented in the first line:
        string line = sr.ReadLine();
        defaultClass = classToClassId[line.Trim()];

        // Read each of the transformations stored in the model file (at most N of them):
        Regex parser = new Regex(@"(?<featName>[^\s]+)\s+(?<from_class>[^\s]+)\s+(?<to_class>[^\s]+)\s+(?<net_gain>[^\s]+)");
        while (!sr.EndOfStream && transformations.Count < N)
        {
            line = sr.ReadLine();
            var match = parser.Match(line);
            int feat_id = featureToFeatureId[match.Groups["featName"].Value.Trim()];
            int from_class_id = classToClassId[match.Groups["from_class"].Value.Trim()];
            int to_class_id = classToClassId[match.Groups["to_class"].Value.Trim()];
            int net_gain = int.Parse(match.Groups["net_gain"].Value.Trim());
            transformations.Add(new Transformation(feat_id, from_class_id, to_class_id, net_gain));
        }
    }

    // Set the minimum gain to -1 (an invalid value) to indicate that the model has been loaded.
    return(new TBLClassifier(transformations, classToClassId.Count, defaultClass, gold_i));
}
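// For reference, a sketch of the model file layout that the reader above expects (the class names,
// feature names, and net-gain values are purely illustrative): the first line holds the default class,
// and each following line holds one "featName from_class to_class net_gain" transformation, which is
// the same layout written by SaveModel below.
//
//   talk.politics.misc
//   gun talk.politics.misc talk.politics.guns 137
//   israel talk.politics.misc talk.politics.mideast 122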
// Methods
/// <summary>Reports the accuracy of the classifier, based on the specified <c>confusionMatrix</c>.</summary>
/// <param name="confusionMatrix">The confusion matrix to report.</param>
/// <param name="classToclassId">A lookup that maps class identifiers to their text representations.</param>
/// <param name="reportTitle">A text description of the set of data being classified.</param>
private static double ReportAccuracy(ConfusionMatrix confusionMatrix, TextIdMapper classToclassId, string reportTitle)
{
    // Write column headers:
    Console.WriteLine("Confusion matrix for '{0}':", reportTitle);
    Console.WriteLine("row is the truth, column is the system output");
    Console.WriteLine();
    Console.Write("            ");
    for (int i = 0; i < confusionMatrix.NoOfDimensions; i++)
    {
        Console.Write(" {0}", classToclassId[i]);
    }
    Console.WriteLine();

    // Write rows.
    for (int i = 0; i < confusionMatrix.NoOfDimensions; i++)
    {
        // Write row header:
        Console.Write("{0}", classToclassId[i]);

        // Write cells:
        for (int j = 0; j < confusionMatrix.NoOfDimensions; j++)
        {
            Console.Write("\t{0}", confusionMatrix[i, j]);
        }
        Console.WriteLine();
    }
    Console.WriteLine();

    double accuracy = confusionMatrix.CalculateAccuracy();
    Console.WriteLine($" {reportTitle} accuracy={accuracy:0.00000}");
    Console.WriteLine();
    return(accuracy);
}
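// An illustrative excerpt of what ReportAccuracy writes to the console (the class names and counts are
// invented; the layout follows the Console.Write calls above, with cells separated by tabs):
//
//   Confusion matrix for 'test data':
//   row is the truth, column is the system output
//
//                guns mideast misc
//   guns         95    3      2
//   mideast      4     92     4
//   misc         5     5      90
//
//    test data accuracy=0.92333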
/// <summary>Saves this model to the specified location.</summary>
public void SaveModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
{
    // Make sure that training has been performed.
    if (!HasTrained)
    {
        PerformTraining();
    }

    using (StreamWriter sw = File.CreateText(model_file))
    {
        // The first line contains the default class name (i.e., the first class in the training data).
        string defaultClassName = classToClassId[0];
        sw.WriteLine($"{defaultClassName} ");

        // Then, write the list of transformations (one transformation per line).
        foreach (var transformation in _transformations)
        {
            // Each transformation line has the format: featName from_class to_class net_gain
            string featName = featureToFeatureId[transformation.FeatureId];
            string to_class = classToClassId[transformation.ToClass];
            string from_class = classToClassId[transformation.FromClass];
            int net_gain = transformation.NetGain;
            sw.WriteLine($"{featName} {from_class} {to_class} {net_gain}");
        }
    }
}
// Public Methods
public override double ExecuteCommand()
{
    FeatureVectorFile vectorFile_train = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);
    FeatureVectorFile vectorFile_test = new FeatureVectorFile(path: test_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: true);

    int gold_i = 0;
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

    var trainingVectors = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses_train = vectorFile_train.Headers[gold_i];

    var testVectors = vectorFile_test.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses_test = vectorFile_test.Headers[gold_i];

    var classifier = new kNNClassifier(k_val, (SimilarityFunction)similarity_func, trainingVectors, classToClassId.Count, gold_i);
    var systemClasses_train = classifier.Classify(trainingVectors);
    var systemClasses_test = classifier.Classify(testVectors);

    var details_train = ProgramOutput.GetDistributionDetails(classifier, trainingVectors, classToClassId);
    var details_test = ProgramOutput.GetDistributionDetails(classifier, testVectors, classToClassId);

    ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.CreateNew, trainingVectors, classToClassId, goldClasses_train, systemClasses_train, details_train, "training data");
    var testAccuracy = ProgramOutput.GenerateSysOutput(sys_output, FileCreationMode.Append, testVectors, classToClassId, goldClasses_test, systemClasses_test, details_test, "test data");
    return(testAccuracy);
}
/// <summary>
/// Trains the classifier provided by the <c>classifierFactory</c> on the vectors in <c>vectorFile_train</c>.
/// The classifier is then used to evaluate the accuracy of both the training data and the test data in <c>vectorFile_test</c>.
/// A report on the classification details is printed to the <c>output_file</c>.
/// </summary>
/// <param name="output_file">A report on the classification details.</param>
/// <param name="classifierFactory">Provides the necessary classifier.</param>
internal static void ReportOnTrainingAndTesting(
    FeatureVectorFile vectorFile_train
    , FeatureVectorFile vectorFile_test
    , string output_file
    , Func<List<FeatureVector>, int, int, Classifier> classifierFactory
    , Func<Classifier, List<FeatureVector>, TextIdMapper, string[]> getDetailsFunc)
{
    int gold_i = 0;
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

    var trainingVectors = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses_train = vectorFile_train.Headers[gold_i];

    var testVectors = vectorFile_test.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses_test = vectorFile_test.Headers[gold_i];

    Classifier classifier = classifierFactory(trainingVectors, classToClassId.Count, gold_i);
    var systemClasses_train = classifier.Classify(trainingVectors);
    var systemClasses_test = classifier.Classify(testVectors);

    var details_train = ProgramOutput.GetDistributionDetails(classifier, trainingVectors, classToClassId);
    var details_test = ProgramOutput.GetDistributionDetails(classifier, testVectors, classToClassId);

    ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.CreateNew, trainingVectors, classToClassId, goldClasses_train, systemClasses_train, details_train, "training data");
    ProgramOutput.GenerateSysOutput(output_file, FileCreationMode.Append, testVectors, classToClassId, goldClasses_test, systemClasses_test, details_test, "test data");
}
// Methods
internal string GetModelAsText(TextIdMapper classToClassId, TextIdMapper wordToWordId)
{
    // TODO: Move a serialization method to the classifier class.
    StringBuilder sb = new StringBuilder();
    Serialize_Recursive(sb, classToClassId, wordToWordId, 0);
    return(sb.ToString());
}
// Static Methods
public new static MaxEntPOSClassifier LoadModel(string model_file, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
{
    string text = File.ReadAllText(model_file);
    List<double> lambda_c;
    List<FeatureVector> vectors;
    LoadModel(text, classToClassId, featureToFeatureId, out lambda_c, out vectors);
    return(new MaxEntPOSClassifier(vectors, classToClassId.Count, lambda_c.ToArray()));
}
// Private Methods
/// <summary>
/// Outputs one classification result per vector as follows: {gold_class_label} {sys_class_label} {details}.
/// </summary>
/// <param name="output_file">The location of the sys_output file.</param>
/// <param name="vectors">A collection of vectors to classify.</param>
/// <param name="classToclassId">A lookup for providing human-readable class labels.</param>
/// <param name="heading">Usually, "Training" or "Test".</param>
internal static double GenerateSysOutput(
    string output_file
    , FileCreationMode fileCreationMode
    , List<FeatureVector> vectors
    , TextIdMapper classToclassId
    , int[] goldClasses
    , int[] systemClasses
    , string[] details
    , string heading)
{
    Debug.Assert(vectors != null && vectors.Count > 0);
    Debug.Assert(systemClasses != null && systemClasses.Length == vectors.Count);

    StreamWriter writer = null;
    try
    {
        switch (fileCreationMode)
        {
            case FileCreationMode.CreateNew:
                writer = File.CreateText(output_file);
                break;

            case FileCreationMode.Append:
                writer = File.AppendText(output_file);
                break;

            default:
                throw new Exception($"Internal error: ProgramOutput.FileCreationMode with value '{fileCreationMode}' is not supported by this version of the application.");
        }

        writer.Write($"%%%%% {heading}:{Environment.NewLine}");

        // For each of the vectors, ...
        var confusionMatrix = new ConfusionMatrix(classToclassId.Count);
        for (int v_i = 0; v_i < vectors.Count; v_i++)
        {
            string trueLabel = classToclassId[goldClasses[v_i]];
            string sysLabel = classToclassId[systemClasses[v_i]];

            // Output the {gold_class_label} {sys_class_label} {details}:
            writer.WriteLine($"{trueLabel}\t{sysLabel}\t{details[v_i]}");
            confusionMatrix[goldClasses[v_i], systemClasses[v_i]]++;
        }
        writer.WriteLine();

        double accuracy = ReportAccuracy(confusionMatrix, classToclassId, heading);
        return(accuracy);
    }
    finally
    {
        if (writer != null)
        {
            writer.Close();
        }
    }
}
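// An illustrative sys_output excerpt as produced by the method above (the labels and probabilities are
// invented); each data line is "{gold_label}\t{sys_label}\t{details}", where the details column here is
// assumed to come from GetDistributionDetails, i.e. the class distribution sorted by probability:
//
//   %%%%% training data:
//   guns      guns    guns 0.98325    misc 0.01012    mideast 0.00663
//   mideast   misc    misc 0.51201    mideast 0.44987    guns 0.03812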
// Methods
public override bool ExecuteCommand()
{
    // Initialize the text-to-Id mappers:
    int gold_i = 0;
    featureToFeatureId = new TextIdMapper();
    classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

    // Workaround: Read everything from STDIN to a file. (Files are used as the text source throughout this application.)
    var svmLight_data = Console.In.ReadToEnd();
    Console.Error.WriteLine("{0} characters of input received.", svmLight_data.Length);
    string tempFile = Path.GetTempFileName();
    int[] goldClasses;
    List<FeatureVector> vectors;
    try
    {
        File.WriteAllText(tempFile, svmLight_data);
        FeatureVectorFile vectorFile = new FeatureVectorFile(path: tempFile, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);
        vectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
        goldClasses = vectorFile.Headers[gold_i];
    }
    finally
    {
        File.Delete(tempFile);
    }
    Debug.Assert(vectors.Count > 0);

    IdValuePair<double>[] chiSquare = new IdValuePair<double>[featureToFeatureId.Count];

    // TODO: Make the implementation less binary dependent (i.e. the hardcoded 2 below).
    double[][,] contingencyTable_f = new double[featureToFeatureId.Count][,];
    for (int f_i = 0; f_i < featureToFeatureId.Count; f_i++)
    {
        // Create a contingency table for this feature.
        contingencyTable_f[f_i] = new double[classToClassId.Count, 2];
        for (int v_i = 0; v_i < vectors.Count; v_i++)
        {
            FeatureVector v = vectors[v_i];
            contingencyTable_f[f_i][v.Headers[gold_i], (int)v.Features[f_i]]++;
        }
        chiSquare[f_i] = new IdValuePair<double>(f_i, StatisticsHelper.CalculateChiSquare(contingencyTable_f[f_i]));
    }
    ReportChiSquareResults(contingencyTable_f, chiSquare);
    return(true);
}
// Public Methods
/// <summary>
/// Renders the vector's used features as text.
/// </summary>
/// <param name="featureToFeatureId">Maps feature identifiers to their text representations.</param>
/// <returns>A space-delimited list of feature:value pairs.</returns>
public string Display(TextIdMapper featureToFeatureId)
{
    StringBuilder sb = new StringBuilder();
    bool isFirst = true;
    foreach (int u_i in UsedFeatures)
    {
        if (isFirst)
        {
            isFirst = false;
        }
        else
        {
            sb.AppendFormat(" ");
        }
        sb.AppendFormat("{0}:{1:0.#####}", featureToFeatureId[u_i], Features[u_i]);
    }
    return(sb.ToString());
}
// Methods
public override double ExecuteCommand()
{
    // Two header columns are expected: the instance name and the gold class.
    FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 2, featureDelimiter: ':', isSortRequired: false);

    // Initialize the text-to-Id mappers:
    featureToFeatureId = new TextIdMapper();
    int instanceName_i = 0;
    int gold_i = 1;
    classToClassId = new TextIdMapper();
    var instanceNameToInstanceNameId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { instanceNameToInstanceNameId, classToClassId };

    // Read the boundaries:
    int[] sentenceLengths = ReadBoundaryFile(boundary_file);

    // Read the classifier model:
    classifier = MaxEntPOSClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);

    // Read the vectors:
    var testVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);
    int[] instanceNameIds = vectorFile.Headers[instanceName_i];
    int[] goldClasses = vectorFile.Headers[gold_i];

    // TODO: Neaten this up a little.
    string[] instanceNames = new string[instanceNameIds.Length];
    for (int i = 0; i < instanceNameIds.Length; i++)
    {
        // Look the name up by its id (not by the loop index).
        int instanceNameId = instanceNameIds[i];
        instanceNames[i] = headerToHeaderIds[instanceName_i][instanceNameId];
    }

    // Generate sys_output:
    var confusionMatrix = GenerateSysOutput(sys_output, instanceNames, testVectors, sentenceLengths, gold_i);
    return(confusionMatrix.CalculateAccuracy());
}
// Methods
public override bool ExecuteCommand()
{
    FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

    // Load the training file.
    int gold_i = 0;
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };
    var trainingVectors = vectorFile.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses = vectorFile.Headers[gold_i];

    double[,] observation, expectation;
    CalculateObservationAndEmpiricalExpectation(trainingVectors, out observation, out expectation);
    OutputEmpiricalCount(observation, expectation);
    return(true);
}
public override double ExecuteCommand()
{
    FeatureVectorFile vectorFile = new FeatureVectorFile(path: vector_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: true);
    FeatureVectorFile modelFile = new FeatureVectorFile(path: model_file, noOfHeaderColumns: 1, featureDelimiter: ':', isSortRequired: true);

    int alphaColumn_i = 0;
    TextIdMapper[] headerToHeaderIds_model = new TextIdMapper[modelFile.NoOfHeaderColumns];
    headerToHeaderIds_model[alphaColumn_i] = new TextIdMapper();

    var accuracy = Program.ReportOnModel(vectorFile, sys_output
        , classifierFactory: (classToClassId, featureToFeatureId) =>
        {
            return(SVMClassifier.LoadModel(modelFile, classToClassId, featureToFeatureId, alphaColumn_i, headerToHeaderIds_model));
        }
        , getDetailsFunc: GetDetails
    );
    return(accuracy);
}
public string GetPath(TextIdMapper wordToWordId)
{
    if (Parent == null)
    {
        //Debug.Assert(depth == 0);
        return(string.Empty);
    }

    string featureName = wordToWordId[this.Parent.f_i];
    if (object.ReferenceEquals(this, Parent.FalseBranch))
    {
        featureName = "!" + featureName;
    }

    string parentFeature = Parent.GetPath(wordToWordId);
    if (parentFeature == string.Empty)
    {
        return(featureName);
    }
    return(parentFeature + "&" + featureName);
}
/// <summary>
/// Trains a classifier on the vectors in the specified <c>vector_file</c>.
/// Outputs the model to the specified <c>model_file</c>.
/// </summary>
/// <param name="model_file">A file containing a serialization of the classifier model.</param>
/// <param name="classifierFactory">Provides the necessary classifier, which must implement ISaveModel.</param>
internal static void TrainModel<T>(
    FeatureVectorFile vector_file
    , string model_file
    , Func<List<FeatureVector>, TextIdMapper, TextIdMapper, T> classifierFactory
) where T : Classifier, ISaveModel
{
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };

    var vectors = vector_file.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    T classifier = classifierFactory(vectors, classToClassId, featureToFeatureId);
    //var systemClasses = classifier.Classify(vectors);
    classifier.SaveModel(model_file, classToClassId, featureToFeatureId);
}
// Methods
public override bool ExecuteCommand()
{
    FeatureVectorFile vectorFile_train = new FeatureVectorFile(path: training_data_file, noOfHeaderColumns: 1, featureDelimiter: ' ', isSortRequired: false);

    int gold_i = 0;
    TextIdMapper featureToFeatureId = new TextIdMapper();
    TextIdMapper classToClassId = new TextIdMapper();
    TextIdMapper[] headerToHeaderIds = new TextIdMapper[] { classToClassId };
    var trainingVectors = vectorFile_train.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Binary);
    var goldClasses_train = vectorFile_train.Headers[gold_i];

    // model_file is optional.
    Func<int, FeatureVector, double> calculate_Prob_c_v;

    // If it is not given, p(c_i|v) = 1/|C|, where |C| is the number of class labels.
    if (string.IsNullOrWhiteSpace(model_file))
    {
        double kProbability = 1D / classToClassId.Count;
        calculate_Prob_c_v = (c_i, v) => { return(kProbability); };
    }
    // If it is given, the model is used to calculate p(y|xi).
    else
    {
        MaxEntClassifier classifier = MaxEntClassifier.LoadModel(model_file, classToClassId, featureToFeatureId);
        calculate_Prob_c_v = (c_i, v) =>
        {
            double[] details;
            int sysClass = classifier.Classify(v, out details);
            return(details[c_i]);
        };
    }

    double[,] expectation = CalculateModelExpectation(trainingVectors, calculate_Prob_c_v);
    OutputEmpiricalCount(expectation, trainingVectors.Count, requiresSort: true);
    return(true);
}
private static string[] GetDetails(Classifier classifier, List<FeatureVector> vectors, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
{
    TBLClassifier tblClassifier = (TBLClassifier)classifier;

    var systemClasses = new int[vectors.Count];
    var details = new string[vectors.Count];
    for (int v_i = 0; v_i < vectors.Count; v_i++)
    {
        StringBuilder sb = new StringBuilder();
        int currentClass = tblClassifier.DefaultClass;
        foreach (TBLClassifier.Transformation t in tblClassifier.Transformations)
        {
            int newClass = tblClassifier.Transform(currentClass, t, vectors[v_i]);
            if (newClass == currentClass)
            {
                continue;
            }

            // Record each transformation that fired for this vector.
            string featName = featureToFeatureId[t.FeatureId];
            string from_class = classToClassId[t.FromClass];
            string to_class = classToClassId[t.ToClass];
            sb.Append($" {featName} {from_class} {to_class}");
            currentClass = newClass;
        }
        systemClasses[v_i] = currentClass;
        details[v_i] = sb.ToString();
    }
    return(details);
}
public static string[] GetDistributionDetails(Classifier classifier, List<FeatureVector> vectors, TextIdMapper classToClassId)
{
    string[] details = new string[vectors.Count];
    for (int v_i = 0; v_i < vectors.Count; v_i++)
    {
        // Classify the vector to obtain its class distribution.
        double[] distribution;
        classifier.Classify(vectors[v_i], out distribution);
        var distribution_sorted = SearchHelper.GetMaxNItems(distribution.Length, distribution);

        // Output the results, ordered from the most to the least probable class:
        StringBuilder sb = new StringBuilder();
        foreach (var classId in distribution_sorted)
        {
            sb.AppendFormat("\t{0}\t{1:0.00000}", classToClassId[classId], distribution[classId]);
        }
        details[v_i] = sb.ToString();
    }
    return(details);
}
protected static void LoadModel(string text, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, out List<double> lambda_c, out List<FeatureVector> vectors)
{
    var probability_c_uf = new Dictionary<int, Dictionary<int, double>>();
    lambda_c = new List<double>();

    int classId = -1;
    string className = null;
    Regex classNamePattern = new Regex(@"FEATURES FOR CLASS (?<className>.+)");
    Regex featurePattern = new Regex(@"(?<feature>\S+)\s+(?<probability>.+)");
    int lineNo = 0;
    foreach (var line in TextHelper.SplitOnNewline(text))
    {
        lineNo++;
        Match match = classNamePattern.Match(line);

        // Branch A: Update the class name.
        if (match.Groups.Count > 1)
        {
            className = match.Groups["className"].Value;
            int newClassId = classToClassId[className];

            // If the class changes, make sure that the dictionary for it exists.
            if (newClassId != classId)
            {
                if (probability_c_uf.ContainsKey(newClassId))
                {
                    Console.Error.WriteLine("Line {0}:\t Category {1} might be listed twice.", lineNo, className);
                }
                else
                {
                    probability_c_uf[newClassId] = new Dictionary<int, double>();
                }
            }
            classId = newClassId;
        }
        // Branch B: Add a new feature.
        else
        {
            Debug.Assert(classId != -1);
            Match featureMatch = featurePattern.Match(line);
            if (featureMatch.Groups.Count > 2)
            {
                string featureName = featureMatch.Groups["feature"].Value;
                double probability = double.Parse(featureMatch.Groups["probability"].Value);

                // Treat the default values slightly differently.
                if (featureName == "<default>")
                {
                    Debug.Assert(classId == lambda_c.Count);
                    lambda_c.Add(probability);
                }
                else
                {
                    int featureId = featureToFeatureId[featureName];

                    // Check that the inner dictionary exists.
                    if (probability_c_uf[classId].ContainsKey(featureId))
                    {
                        Console.Error.WriteLine("Line {0}:\tFeature: {1} appears twice in category {2}.", lineNo, featureName, className);
                    }
                    probability_c_uf[classId][featureId] = probability;
                }
            }
        }
    }

    // Create feature vectors based on the information we've extracted.
    vectors = new List<FeatureVector>();
    foreach (int c_i in probability_c_uf.Keys)
    {
        ValueCollection features = new ValueCollection(featureToFeatureId.Count);
        foreach (int usedFeatureId in probability_c_uf[c_i].Keys)
        {
            features[usedFeatureId] = probability_c_uf[c_i][usedFeatureId];
        }
        FeatureVector vector = new FeatureVector(new int[] { c_i }, features, probability_c_uf[c_i].Keys.ToArray(), false);
        vectors.Add(vector);
    }
}
/// <summary>Loads and returns a collection of FeatureVectors from the file at this instance's <c>Path</c>.</summary>
/// <param name="featureToFeatureId">
/// A mapping between the features' text values and the internal numeric identifiers that represent those values.
/// </param>
/// <param name="headerToHeaderIds">
/// One mapping per header column (e.g., class names to internal numeric class identifiers).
/// </param>
/// <param name="featureType">Determines how the raw counts are converted into feature values.</param>
/// <returns>The feature vectors read from the file.</returns>
public List<FeatureVector> LoadFromSVMLight(
    TextIdMapper featureToFeatureId
    , TextIdMapper[] headerToHeaderIds
    , FeatureType featureType)
{
    Debug.Assert(headerToHeaderIds != null && headerToHeaderIds.Length == this.NoOfHeaderColumns);

    // Step 1: Read the data file:
    string[] lines = File.ReadAllLines(this.Path);
    var wordBags_i = new List<Dictionary<int, int>>();

    // Now that we know the number of lines, we can create the arrays for storing the header columns.
    for (int j = 0; j < Headers.Length; j++)
    {
        Headers[j] = new int[lines.Length];
        Debug.Assert(headerToHeaderIds[j] != null);
    }

    // Store the header rows:
    HeaderRows = new string[NoOfHeaderRows];
    for (int i = 0; i < NoOfHeaderRows; i++)
    {
        HeaderRows[i] = lines[i];
    }

    // Parse 1: Iterate over each of the rows:
    for (int i = NoOfHeaderRows; i < lines.Length; i++)
    {
        string line = lines[i];
        var chunks = TextHelper.SplitOnWhitespaceOr(line, FeatureDelimiter);

        // The first chunks contain the header columns (e.g., the class):
        int j = 0;
        for (; j < Headers.Length; j++)
        {
            Headers[j][i - NoOfHeaderRows] = headerToHeaderIds[j][chunks[j]];
        }

        // For each of the words in the document, ...
        var wordToWordCount = new Dictionary<int, int>();
        for (; j < chunks.Length; j += 2)
        {
            int count = Int32.Parse(chunks[j + 1]);
            var featureId = featureToFeatureId[chunks[j]];

            // Add this count to the existing sum:
            int sum;
            if (!wordToWordCount.TryGetValue(featureId, out sum))
            {
                sum = 0;
            }
            wordToWordCount[featureId] = sum + count;
        }
        wordBags_i.Add(wordToWordCount);
    }

    // Parse 2: Build one FeatureVector per row, using the word counts collected in parse 1.
    var vectors = new List<FeatureVector>();
    for (int i = NoOfHeaderRows; i < lines.Length; i++)
    {
        var wordCounts = wordBags_i[i - NoOfHeaderRows];
        var allFeatures = new ValueCollection(featureToFeatureId.Count);
        var usedFeatures = new int[wordCounts.Keys.Count];
        int[] headers_j = new int[NoOfHeaderColumns];
        for (int j = 0; j < NoOfHeaderColumns; j++)
        {
            headers_j[j] = Headers[j][i - NoOfHeaderRows];
        }

        int w_i = 0;
        foreach (int f_i in wordCounts.Keys)
        {
            allFeatures[f_i] = GetFeatureValue(featureType, wordCounts[f_i]);
            usedFeatures[w_i++] = f_i;
        }
        vectors.Add(new FeatureVector(headers_j, allFeatures, usedFeatures, IsSortRequired));
    }
    return(vectors);
}
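// For reference, a sketch of the input this reader expects (the labels, words, and counts are invented):
// each data line starts with NoOfHeaderColumns header chunks (here a single class label), followed by
// "word count" pairs separated by whitespace or the configured FeatureDelimiter, e.g.:
//
//   talk.politics.guns gun 3 law 2 vote 1
//   talk.politics.mideast peace 4 israel 2 gun 1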
private static string[] GetDetails(Classifier classifier, List<FeatureVector> vectors, TextIdMapper classToClassId, TextIdMapper featureToFeatureId)
{
    var detailsAsText = new string[vectors.Count];
    for (int v_i = 0; v_i < vectors.Count; v_i++)
    {
        // Classify the vector to obtain its details (a single value for this classifier).
        double[] details;
        classifier.Classify(vectors[v_i], out details);
        Debug.Assert(details.Length == 1);
        detailsAsText[v_i] = $"{details[0]:0.00000}";
    }
    return(detailsAsText);
}
public static Classifier LoadModel(FeatureVectorFile vectorFile_model, TextIdMapper classToclassId, TextIdMapper featureToFeatureId, int alphaColumn_i, TextIdMapper[] headerToHeaderIds)
{
    // Peek into the file to see what type of SVM model this is:
    int i = 0;
    LibSVM_KernelType kernel_type = LibSVM_KernelType.linear;
    foreach (var line in File.ReadLines(vectorFile_model.Path))
    {
        if (i == 0)
        {
            Debug.Assert(line.StartsWith("svm_type") && line.EndsWith("c_svc"));
        }
        else if (i == 1)
        {
            kernel_type = (LibSVM_KernelType)Enum.Parse(typeof(LibSVM_KernelType), line.Substring(line.LastIndexOfAny(TextHelper.WhiteSpace)));
        }
        else
        {
            break;
        }
        i++;
    }

    // Override the number of header rows according to the model type.
    switch (kernel_type)
    {
        case LibSVM_KernelType.linear:
            vectorFile_model.NoOfHeaderRows = 8;
            break;

        case LibSVM_KernelType.polynomial:
            vectorFile_model.NoOfHeaderRows = 11;
            break;

        case LibSVM_KernelType.rbf:
            vectorFile_model.NoOfHeaderRows = 9;
            break;

        case LibSVM_KernelType.sigmoid:
            vectorFile_model.NoOfHeaderRows = 10;
            break;

        default:
            throw new NotImplementedException();
    }

    // Read each of the support vectors:
    var modelVectors = vectorFile_model.LoadFromSVMLight(featureToFeatureId, headerToHeaderIds, FeatureType.Continuous);

    // Read the model file header:
    double rho = 0;
    double gamma = 0;
    double coef = 0;
    double degree = 0;
    Debug.Assert(vectorFile_model.HeaderRows[vectorFile_model.NoOfHeaderRows - 1] == "SV");
    for (i = 2; i < vectorFile_model.NoOfHeaderRows - 1; i++)
    {
        string line = vectorFile_model.HeaderRows[i];

        // Ignore non-informative meta-data:
        if (line.StartsWith("nr_class") || line.StartsWith("total_sv") || line.StartsWith("label") || line.StartsWith("nr_sv"))
        {
            continue;
        }

        string text = line.Substring(line.LastIndexOfAny(TextHelper.WhiteSpace));
        if (line.StartsWith("rho"))
        {
            rho = double.Parse(text);
        }
        else if (line.StartsWith("gamma"))
        {
            gamma = double.Parse(text);
        }
        else if (line.StartsWith("degree"))
        {
            degree = double.Parse(text);
        }
        else if (line.StartsWith("coef"))
        {
            coef = double.Parse(text);
        }
        else
        {
            throw new NotImplementedException();
        }
    }

    // Read the alpha (weight) of each support vector from its header column:
    double[] weights = new double[modelVectors.Count];
    for (i = 0; i < weights.Length; i++)
    {
        weights[i] = Convert.ToDouble(headerToHeaderIds[alphaColumn_i][vectorFile_model.Headers[alphaColumn_i][i]]);
    }

    switch (kernel_type)
    {
        case LibSVM_KernelType.linear:
            return(new LibSVMClassifier_Linear(modelVectors, weights, rho));

        case LibSVM_KernelType.polynomial:
            return(new LibSVMClassifier_Polynomial(modelVectors, weights, rho, degree, gamma, coef));

        case LibSVM_KernelType.rbf:
            return(new LibSVMClassifier_RBF(modelVectors, weights, rho, gamma));

        case LibSVM_KernelType.sigmoid:
            return(new LibSVMClassifier_Sigmoid(modelVectors, weights, rho, gamma, coef));

        default:
            throw new NotImplementedException();
    }
}
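// For orientation, an illustrative header of a linear libSVM (c_svc) model of the kind parsed above.
// The counts and the rho value are invented, but the eight header rows before the support vectors
// correspond to the NoOfHeaderRows = 8 case handled in the switch statement:
//
//   svm_type c_svc
//   kernel_type linear
//   nr_class 2
//   total_sv 150
//   rho 0.42371
//   label 0 1
//   nr_sv 72 78
//   SV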