コード例 #1
0
        // Construction

        /// <summary>
        /// Creates a feature vector and pre-computes its cached text representation and hash code.
        /// </summary>
        /// <param name="headers">Header values stored in <c>Headers</c>.</param>
        /// <param name="features">Value collection stored in <c>Features</c>; indexed by feature id.</param>
        /// <param name="usedFeatures">
        /// Ids of the features that are listed in the text representation. The array is stored as-is
        /// (NOTE(review): presumably by reference, so sorting would also reorder the caller's array - confirm).
        /// </param>
        /// <param name="sortUsedFeatures">
        /// When <c>true</c>, <c>UsedFeatures</c> is passed to <c>SortHelper.QuickSort</c> and the vector is flagged as sorted.
        /// </param>
        public FeatureVector(int[] headers, ValueCollection features, int[] usedFeatures, bool sortUsedFeatures)
        {
            Headers      = headers;
            Features     = features;
            UsedFeatures = usedFeatures;

            // Callers that need a canonical ordering ask for the used features to be sorted up front.
            if (sortUsedFeatures)
            {
                SortHelper.QuickSort(UsedFeatures);
                isSorted = true;
            }

            // The text form "{id:value, id:value, ...}" is built exactly once; both it and its hash code are
            // cached so that dictionary lookups and similar operations do not have to recompute them.
            var text = new StringBuilder("{");

            for (int position = 0; position < UsedFeatures.Length; position++)
            {
                // Every entry except the first is preceded by a separator.
                if (position > 0)
                {
                    text.Append(", ");
                }

                int featureId = UsedFeatures[position];
                text.AppendFormat("{0}:{1}", featureId, Features[featureId]);
            }

            // AppendLine: the cached text ends with a line terminator after the closing brace.
            text.AppendLine("}");

            _text     = text.ToString();
            _hashCode = _text.GetHashCode();
        }
コード例 #2
0
        /// <summary>
        /// Reads the file at <c>this.Path</c> and returns one FeatureVector per data row.
        /// </summary>
        /// <remarks>
        /// The first <c>NoOfHeaderRows</c> lines are copied verbatim into <c>HeaderRows</c>. Every following line is
        /// split into chunks with <c>TextHelper.SplitOnWhitespaceOr</c>: the leading <c>Headers.Length</c> chunks are
        /// header values and the remaining chunks are read as (feature, count) pairs. Counts of a feature that
        /// occurs more than once on a line are summed. As a side effect the arrays held in <c>Headers</c> are
        /// replaced and <c>HeaderRows</c> is reassigned.
        /// </remarks>
        /// <param name="featureToFeatureId">
        /// Maps a feature's text to its numeric identifier. Its <c>Count</c> is read only after every row has been
        /// parsed and is used as the size of each vector's value collection.
        /// </param>
        /// <param name="headerToHeaderIds">
        /// One mapper per header column; entry <c>j</c> translates the text in header column <c>j</c> to a numeric
        /// identifier. Expected to be non-null with <c>NoOfHeaderColumns</c> non-null entries (checked by debug assertions only).
        /// </param>
        /// <param name="featureType">Passed to <c>GetFeatureValue</c> to turn a summed count into the stored feature value.</param>
        /// <returns>The feature vectors, in the same order as the data rows of the file.</returns>
        /// <exception cref="FormatException">
        /// A data row has fewer chunks than header columns, contains a feature without a count, or a count is not a valid integer.
        /// </exception>
        public List <FeatureVector> LoadFromSVMLight(
            TextIdMapper featureToFeatureId
            , TextIdMapper[] headerToHeaderIds
            , FeatureType featureType)
        {
            Debug.Assert(headerToHeaderIds != null && headerToHeaderIds.Length == this.NoOfHeaderColumns);

            // Step 1: Read the data file:
            string[] lines = File.ReadAllLines(this.Path);

            var wordBags_i = new List <Dictionary <int, int> >();

            // Now that we know the number of lines, we can create the arrays for storing the header columns.
            // NOTE(review): the arrays are sized by the total line count, so the last NoOfHeaderRows slots are never
            // written; kept as-is because other code may depend on that length.
            for (int j = 0; j < Headers.Length; j++)
            {
                Headers[j] = new int[lines.Length];
                Debug.Assert(headerToHeaderIds[j] != null);
            }

            // Store the header rows:
            HeaderRows = new string[NoOfHeaderRows];
            for (int i = 0; i < NoOfHeaderRows; i++)
            {
                HeaderRows[i] = lines[i];
            }

            // Parse 1: Iterate over each of the data rows:
            for (int i = NoOfHeaderRows; i < lines.Length; i++)
            {
                string line   = lines[i];
                var    chunks = TextHelper.SplitOnWhitespaceOr(line, FeatureDelimiter);
                int    row    = i - NoOfHeaderRows;

                // Reject malformed rows up front with a message that names the offending line, instead of
                // failing later with an index-out-of-range error.
                if (chunks.Length < Headers.Length)
                {
                    throw new FormatException(string.Format(
                        "Line {0}: expected at least {1} header column(s) but found {2} chunk(s).",
                        i + 1, Headers.Length, chunks.Length));
                }
                if ((chunks.Length - Headers.Length) % 2 != 0)
                {
                    throw new FormatException(string.Format(
                        "Line {0}: the last feature has no count (features must come in feature/count pairs).",
                        i + 1));
                }

                // The leading chunks contain the header columns (e.g. the class):
                int j = 0;
                for (; j < Headers.Length; j++)
                {
                    Headers[j][row] = headerToHeaderIds[j][chunks[j]];
                }

                // For each of the words in the document, ...
                var wordToWordCount = new Dictionary <int, int>();
                for (; j < chunks.Length; j += 2)
                {
                    int count     = Int32.Parse(chunks[j + 1]);
                    var featureId = featureToFeatureId[chunks[j]];
                    // Add this count to the existing sum:
                    int sum;
                    if (!wordToWordCount.TryGetValue(featureId, out sum))
                    {
                        sum = 0;
                    }
                    wordToWordCount[featureId] = sum + count;
                }
                wordBags_i.Add(wordToWordCount);
            }

            // Parse 2: Build one vector per data row.
            // This is a separate pass because featureToFeatureId.Count is read only here, after every row has been
            // looked up. NOTE(review): presumably the mapper assigns new ids on first lookup, so the count is only
            // final at this point - confirm against TextIdMapper before merging the passes.
            var vectors = new List <FeatureVector>(wordBags_i.Count);

            for (int i = NoOfHeaderRows; i < lines.Length; i++)
            {
                int   row          = i - NoOfHeaderRows;
                var   wordCounts   = wordBags_i[row];
                var   allFeatures  = new ValueCollection(featureToFeatureId.Count);
                var   usedFeatures = new int[wordCounts.Count];
                int[] headers_j    = new int[NoOfHeaderColumns];
                for (int j = 0; j < NoOfHeaderColumns; j++)
                {
                    headers_j[j] = Headers[j][row];
                }

                // Enumerate the pairs directly so that each count is not looked up a second time.
                int w_i = 0;
                foreach (var wordCount in wordCounts)
                {
                    allFeatures[wordCount.Key] = GetFeatureValue(featureType, wordCount.Value);
                    usedFeatures[w_i++]        = wordCount.Key;
                }
                vectors.Add(new FeatureVector(headers_j, allFeatures, usedFeatures, IsSortRequired));
            }
            return(vectors);
        }
コード例 #3
0
        /// <summary>
        /// Parses a textual model into per-class default values and one feature vector per class.
        /// </summary>
        /// <remarks>
        /// A line matching <c>FEATURES FOR CLASS name</c> selects the current class. Any other line of the form
        /// <c>feature value</c> is attributed to the current class, except that the feature named
        /// <c>&lt;default&gt;</c> is appended to <paramref name="lambda_c"/> instead of being stored as a feature.
        /// Lines matching neither form are ignored. A class or feature that appears twice is reported on standard
        /// error; for a repeated feature the last value wins. Numeric values are parsed with the invariant culture,
        /// so "0.5" is read the same way regardless of the machine's regional settings.
        /// </remarks>
        /// <param name="text">The complete model text; it is split into lines with <c>TextHelper.SplitOnNewline</c>.</param>
        /// <param name="classToClassId">Maps a class name to its numeric identifier.</param>
        /// <param name="featureToFeatureId">
        /// Maps a feature name to its numeric identifier; its <c>Count</c> (read after parsing) sizes each value collection.
        /// </param>
        /// <param name="lambda_c">
        /// Receives the <c>&lt;default&gt;</c> values in the order they are encountered. A debug assertion expects the
        /// value for class id <c>k</c> to land at index <c>k</c>.
        /// </param>
        /// <param name="vectors">
        /// Receives one vector per class: its single header is the class id, its used features are the ids of the
        /// features listed for that class, and sorting is not requested.
        /// </param>
        protected static void LoadModel(string text, TextIdMapper classToClassId, TextIdMapper featureToFeatureId, out List <double> lambda_c, out List <FeatureVector> vectors)
        {
            // class id -> (feature id -> value)
            var probability_c_uf = new Dictionary <int, Dictionary <int, double> >();

            lambda_c = new List <double>();
            int    classId          = -1;
            string className        = null;
            Regex  classNamePattern = new Regex(@"FEATURES FOR CLASS (?<className>.+)");
            Regex  featurePattern   = new Regex(@"(?<feature>\S+)\s+(?<probability>.+)");
            int    lineNo           = 0;

            foreach (var line in TextHelper.SplitOnNewline(text))
            {
                lineNo++;

                // Branch A: Update the class name.
                Match classMatch = classNamePattern.Match(line);
                if (classMatch.Success)
                {
                    className = classMatch.Groups["className"].Value;
                    int newClassId = classToClassId[className];

                    // If the class changes, make sure that the dictionary for it exists.
                    if (newClassId != classId)
                    {
                        if (probability_c_uf.ContainsKey(newClassId))
                        {
                            Console.Error.WriteLine("Line {0}:\t Category {1} might be listed twice.", lineNo, className);
                        }
                        else
                        {
                            probability_c_uf[newClassId] = new Dictionary <int, double>();
                        }
                    }
                    classId = newClassId;
                    continue;
                }

                // Branch B: Add a new feature. Every non-header line is expected to follow a class header.
                Debug.Assert(classId != -1);
                Match featureMatch = featurePattern.Match(line);
                if (!featureMatch.Success)
                {
                    // Neither a class header nor a "feature value" pair: nothing to record.
                    continue;
                }

                string featureName = featureMatch.Groups["feature"].Value;
                // Model files are data, not user-facing text: parse with the invariant culture so the decimal
                // point is not misinterpreted under locales that use a decimal comma.
                double probability = double.Parse(
                    featureMatch.Groups["probability"].Value,
                    System.Globalization.CultureInfo.InvariantCulture);

                // Treat the default values slightly differently.
                if (featureName == "<default>")
                {
                    Debug.Assert(classId == lambda_c.Count);
                    lambda_c.Add(probability);
                }
                else
                {
                    int featureId = featureToFeatureId[featureName];

                    // Fetch the inner dictionary once instead of once per access.
                    Dictionary <int, double> classProbabilities = probability_c_uf[classId];
                    if (classProbabilities.ContainsKey(featureId))
                    {
                        Console.Error.WriteLine("Line {0}:\tFeature: {1} appears twice in category {2}.", lineNo, featureName, className);
                    }
                    classProbabilities[featureId] = probability;
                }
            }

            // Create feature vectors based on the information we've extracted.
            vectors = new List <FeatureVector>();
            foreach (var classEntry in probability_c_uf)
            {
                ValueCollection features = new ValueCollection(featureToFeatureId.Count);
                foreach (var featureEntry in classEntry.Value)
                {
                    features[featureEntry.Key] = featureEntry.Value;
                }
                FeatureVector vector = new FeatureVector(new int[] { classEntry.Key }, features, classEntry.Value.Keys.ToArray(), false);
                vectors.Add(vector);
            }
        }