// Construction

/// <summary>
/// Creates a feature vector from its header values, its feature values and the indices of the
/// features that are in use, and caches a text representation and hash code for it.
/// </summary>
/// <param name="headers">Assigned to <c>Headers</c>.</param>
/// <param name="features">Assigned to <c>Features</c>; read at every index listed in <paramref name="usedFeatures"/>.</param>
/// <param name="usedFeatures">
/// Indices into <paramref name="features"/>. Assigned to <c>UsedFeatures</c> and passed to
/// <c>SortHelper.QuickSort</c> when <paramref name="sortUsedFeatures"/> is true
/// (presumably an in-place sort of the caller's array; confirm against SortHelper).
/// </param>
/// <param name="sortUsedFeatures">When true, the used-feature indices are sorted and <c>isSorted</c> is set.</param>
public FeatureVector(int[] headers, ValueCollection features, int[] usedFeatures, bool sortUsedFeatures)
{
    Headers = headers;
    Features = features;
    UsedFeatures = usedFeatures;

    // Some callers want the used-feature indices in sorted order.
    if (sortUsedFeatures)
    {
        SortHelper.QuickSort(UsedFeatures);
        isSorted = true;
    }

    // The text form "{index:value, index:value, ...}" and its hash code are computed once here
    // so that later lookups (e.g. dictionary keys) do not have to rebuild them.
    var text = new StringBuilder("{");
    for (int position = 0; position < UsedFeatures.Length; position++)
    {
        // Separator goes in front of every entry except the first one.
        if (position > 0)
        {
            text.Append(", ");
        }

        int featureIndex = UsedFeatures[position];
        text.AppendFormat("{0}:{1}", featureIndex, Features[featureIndex]);
    }

    // Note: the closing brace is followed by a line terminator, which is part of the cached text.
    text.AppendLine("}");

    _text = text.ToString();
    _hashCode = _text.GetHashCode();
}
/// <summary>
/// Reads the file at <c>this.Path</c> (features stored in SVM format) and returns one
/// <see cref="FeatureVector"/> per data row.
/// </summary>
/// <remarks>
/// The first <c>NoOfHeaderRows</c> lines are copied verbatim into <c>HeaderRows</c>. Every following line is
/// tokenized with <c>TextHelper.SplitOnWhitespaceOr(line, FeatureDelimiter)</c>: the first
/// <c>Headers.Length</c> tokens are header columns and the remaining tokens are (feature, count) pairs.
/// Counts of a feature that occurs several times on one line are summed.
/// As a side effect, <c>Headers</c> and <c>HeaderRows</c> are overwritten.
/// </remarks>
/// <param name="featureToFeatureId">
/// A mapping between the features' text values and the internal numeric identifiers that represent them.
/// </param>
/// <param name="headerToHeaderIds">
/// One mapping per header column, translating that column's text values into numeric identifiers.
/// Must contain exactly <c>NoOfHeaderColumns</c> non-null entries.
/// </param>
/// <param name="featureType">Passed to <c>GetFeatureValue</c> to turn a summed count into the stored feature value.</param>
/// <returns>The feature vectors, in the order of the data rows in the file.</returns>
/// <exception cref="FormatException">
/// A data row has fewer tokens than header columns, has a feature without a count, or has a count that is
/// not a valid integer.
/// </exception>
public List<FeatureVector> LoadFromSVMLight(
    TextIdMapper featureToFeatureId,
    TextIdMapper[] headerToHeaderIds,
    FeatureType featureType)
{
    Debug.Assert(headerToHeaderIds != null && headerToHeaderIds.Length == this.NoOfHeaderColumns);

    // Step 1: Read the data file.
    string[] lines = File.ReadAllLines(this.Path);
    var wordBags_i = new List<Dictionary<int, int>>();

    // Now that the number of lines is known, create the arrays that store the header columns.
    // (They are sized by the total line count; only the first lines.Length - NoOfHeaderRows
    // entries are filled below.)
    for (int j = 0; j < Headers.Length; j++)
    {
        Headers[j] = new int[lines.Length];
        Debug.Assert(headerToHeaderIds[j] != null);
    }

    // Store the header rows.
    HeaderRows = new string[NoOfHeaderRows];
    for (int i = 0; i < NoOfHeaderRows; i++)
    {
        HeaderRows[i] = lines[i];
    }

    // Parse 1: Turn every data row into header ids plus a featureId -> summed count bag.
    for (int i = NoOfHeaderRows; i < lines.Length; i++)
    {
        string line = lines[i];
        var chunks = TextHelper.SplitOnWhitespaceOr(line, FeatureDelimiter);

        // Reject malformed rows up front with a message that names the line, instead of
        // failing later with an index-out-of-range error.
        int pairTokenCount = chunks.Length - Headers.Length;
        if (pairTokenCount < 0 || pairTokenCount % 2 != 0)
        {
            throw new FormatException(string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "Line {0}: expected {1} header column(s) followed by feature/count pairs, but found {2} token(s).",
                i + 1,
                Headers.Length,
                chunks.Length));
        }

        // The leading chunks contain the header columns (e.g. the class).
        int j = 0;
        for (; j < Headers.Length; j++)
        {
            Headers[j][i - NoOfHeaderRows] = headerToHeaderIds[j][chunks[j]];
        }

        // The remaining chunks are (feature, count) pairs.
        var wordToWordCount = new Dictionary<int, int>();
        for (; j < chunks.Length; j += 2)
        {
            // The file is machine-readable, so the count is parsed independently of the current culture.
            int count = Int32.Parse(chunks[j + 1], System.Globalization.CultureInfo.InvariantCulture);
            var featureId = featureToFeatureId[chunks[j]];

            // Add this count to the existing sum.
            int sum;
            if (!wordToWordCount.TryGetValue(featureId, out sum))
            {
                sum = 0;
            }
            wordToWordCount[featureId] = sum + count;
        }
        wordBags_i.Add(wordToWordCount);
    }

    // Parse 2: Build the vectors. This is a separate pass because each ValueCollection is sized by
    // featureToFeatureId.Count, which is read only after every row has been looked up
    // (presumably the mapper can grow during Parse 1; confirm against TextIdMapper).
    var vectors = new List<FeatureVector>();
    for (int row = 0; row < wordBags_i.Count; row++)
    {
        Dictionary<int, int> wordCounts = wordBags_i[row];
        var allFeatures = new ValueCollection(featureToFeatureId.Count);
        var usedFeatures = new int[wordCounts.Count];

        int[] headers_j = new int[NoOfHeaderColumns];
        for (int j = 0; j < NoOfHeaderColumns; j++)
        {
            headers_j[j] = Headers[j][row];
        }

        // Enumerate key/value pairs directly rather than looking every key up again.
        int w_i = 0;
        foreach (KeyValuePair<int, int> featureCount in wordCounts)
        {
            allFeatures[featureCount.Key] = GetFeatureValue(featureType, featureCount.Value);
            usedFeatures[w_i++] = featureCount.Key;
        }

        vectors.Add(new FeatureVector(headers_j, allFeatures, usedFeatures, IsSortRequired));
    }

    return vectors;
}
/// <summary>
/// Parses a textual model listing into per-class default values and one feature vector per class.
/// </summary>
/// <remarks>
/// A line containing <c>FEATURES FOR CLASS name</c> selects the current class. Any other line of the form
/// <c>feature value</c> is attributed to the current class: the feature named <c>&lt;default&gt;</c> is
/// appended to <paramref name="lambda_c"/>, every other feature is stored in that class's vector.
/// Lines matching neither pattern are ignored. Duplicate classes or features are reported on
/// <c>Console.Error</c>; for a duplicate feature the later value wins.
/// </remarks>
/// <param name="text">The complete model text; it is split into lines with <c>TextHelper.SplitOnNewline</c>.</param>
/// <param name="classToClassId">Maps class names to numeric class identifiers.</param>
/// <param name="featureToFeatureId">Maps feature names to numeric feature identifiers; its <c>Count</c> sizes every vector.</param>
/// <param name="lambda_c">Receives the <c>&lt;default&gt;</c> values in the order they appear in the text.</param>
/// <param name="vectors">Receives one vector per class, with the class identifier as its only header.</param>
/// <exception cref="FormatException">A feature line's value is not a valid number.</exception>
protected static void LoadModel(string text, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, out List<double> lambda_c, out List<FeatureVector> vectors)
{
    var probability_c_uf = new Dictionary<int, Dictionary<int, double>>();
    lambda_c = new List<double>();
    int classId = -1;
    string className = null;
    Regex classNamePattern = new Regex(@"FEATURES FOR CLASS (?<className>.+)");
    Regex featurePattern = new Regex(@"(?<feature>\S+)\s+(?<probability>.+)");
    int lineNo = 0;

    foreach (var line in TextHelper.SplitOnNewline(text))
    {
        lineNo++;

        // Branch A: The line names a class; make it the current one.
        Match match = classNamePattern.Match(line);
        if (match.Success)
        {
            className = match.Groups["className"].Value;
            int newClassId = classToClassId[className];

            // If the class changes, make sure that the dictionary for it exists.
            if (newClassId != classId)
            {
                if (probability_c_uf.ContainsKey(newClassId))
                {
                    Console.Error.WriteLine("Line {0}:\t Category {1} might be listed twice.", lineNo, className);
                }
                else
                {
                    probability_c_uf[newClassId] = new Dictionary<int, double>();
                }
            }
            classId = newClassId;
            continue;
        }

        // Branch B: The line may describe a feature of the current class; anything else is skipped.
        Match featureMatch = featurePattern.Match(line);
        if (!featureMatch.Success)
        {
            continue;
        }

        // A feature line is only meaningful once a class header has been seen.
        Debug.Assert(classId != -1);

        string featureName = featureMatch.Groups["feature"].Value;

        // The model text is machine-readable, so the value is parsed with the invariant culture;
        // otherwise "0.5" would be mis-read on systems whose decimal separator is a comma.
        double probability = double.Parse(
            featureMatch.Groups["probability"].Value,
            System.Globalization.CultureInfo.InvariantCulture);

        // Treat the default values slightly differently.
        if (featureName == "<default>")
        {
            // lambda_c is indexed by position, so defaults are expected in class-id order.
            Debug.Assert(classId == lambda_c.Count);
            lambda_c.Add(probability);
        }
        else
        {
            int featureId = featureToFeatureId[featureName];

            // Fetch the current class's dictionary once (throws KeyNotFoundException if no
            // dictionary was created for classId, exactly as a direct indexer access would).
            Dictionary<int, double> classFeatures = probability_c_uf[classId];
            if (classFeatures.ContainsKey(featureId))
            {
                Console.Error.WriteLine("Line {0}:\tFeature: {1} appears twice in category {2}.", lineNo, featureName, className);
            }
            classFeatures[featureId] = probability;
        }
    }

    // Create feature vectors based on the information we've extracted.
    vectors = new List<FeatureVector>();
    foreach (KeyValuePair<int, Dictionary<int, double>> classEntry in probability_c_uf)
    {
        Dictionary<int, double> classFeatures = classEntry.Value;

        ValueCollection features = new ValueCollection(featureToFeatureId.Count);
        foreach (KeyValuePair<int, double> feature in classFeatures)
        {
            features[feature.Key] = feature.Value;
        }

        FeatureVector vector = new FeatureVector(new int[] { classEntry.Key }, features, classFeatures.Keys.ToArray(), false);
        vectors.Add(vector);
    }
}