/// <summary>
/// Adds the appropriate features for the token at the specified index with the
/// specified array of previous outcomes to the specified list of features.
/// </summary>
/// <param name="features">The list of features to be added to.</param>
/// <param name="tokens">The tokens of the sentence or other text unit being processed.</param>
/// <param name="index">The index of the token which is currently being processed.</param>
/// <param name="previousOutcomes">The outcomes for the tokens prior to the specified index.</param>
public override void CreateFeatures(List<string> features, string[] tokens, int index, string[] previousOutcomes) {
    // Build the character n-grams of the current token; {token, min, max} invokes
    // the model's Add(string, int, int) via the collection-initializer syntax.
    var ngrams = new NGramModel { { tokens[index], min, max } };
    foreach (var ngram in ngrams) {
        if (ngram.Count > 0) {
            var feature = "ng=" + ngram[0].ToLowerInvariant();
            features.Add(feature);
        }
    }
}
// ---------------------------------------------------------------------------
// Esempio n. 2 (scraping residue — marks the start of a second, unrelated
// code fragment; the stray "0" was part of the same extraction artifact)
// ---------------------------------------------------------------------------
        /// <summary>
        /// Builds the NGram dictionary with the given samples.
        /// </summary>
        /// <param name="samples">The samples.</param>
        /// <param name="cutoff">The cutoff.</param>
        /// <returns>The NGram dictionary.</returns>
        public static Dict BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
            var ngrams = new NGramModel();

            // Drain the sample stream, recording each non-empty sentence as unigrams.
            for (var sample = samples.Read(); sample != null; sample = samples.Read()) {
                if (sample.Sentence.Length > 0) {
                    ngrams.Add(new StringList(sample.Sentence), 1, 1);
                }
            }

            // Drop every n-gram observed fewer than `cutoff` times.
            ngrams.CutOff(cutoff, int.MaxValue);

            return ngrams.ToDictionary();
        }
        /// <summary>
        /// Creates a n-gram dictionary from the specified data stream using the specified head rule and specified cut-off.
        /// </summary>
        /// <param name="data">The data stream of parses.</param>
        /// <param name="rules">The head rules for the parses.</param>
        /// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
        /// <returns>A dictionary object.</returns>
        // NOTE(review): return type here is `Dic` while BuildNGramDictionary above returns
        // `Dict` — possibly a truncated identifier from extraction; confirm against the
        // declaring project before relying on either name.
        public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
            // Cutoff defaults to 5 when not present in the "dict" parameter group.
            var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
            var dict = new NGramModel();

            Parse p;
            while ((p = data.Read()) != null) {
                p.UpdateHeads(rules);
                var pWords = p.GetTagNodes();
                var words = new string[pWords.Length];
                //add all uni-grams
                for (var wi = 0; wi < words.Length; wi++) {
                    words[wi] = pWords[wi].CoveredText;
                }

                dict.Add(new StringList(words), 1, 1);
                //add tri-grams and bi-grams for initial sequence
                var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
                var cWords = new string[chunks.Length];
                for (var wi = 0; wi < cWords.Length; wi++) {
                    cWords[wi] = chunks[wi].Head.CoveredText;
                }
                dict.Add(new StringList(cWords), 2, 3);

                //emulate reductions to produce additional n-grams
                var ci = 0;
                while (ci < chunks.Length) {
                    /*
                    if (chunks[ci].Parent == null) {
                        chunks[ci].Show();
                    } */
                    if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                        //perform reduce
                        // Scan left to find the first sibling sharing this chunk's parent;
                        // the reduce replaces the run [reduceStart..ci] with its parent node.
                        var reduceStart = ci;
                        while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                            reduceStart--;
                        }
                        reduceStart++;
                        // ReduceChunks rebuilds the chunk array and may adjust ci via ref.
                        chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                        ci = reduceStart;
                        if (chunks.Length != 0) {
                            // Collect up to a 5-token window: two chunks of context on each
                            // side of the reduced position, skipping out-of-range slots.
                            var window = new string[5];
                            var wi = 0;
                            if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].Head.CoveredText;
                            if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].Head.CoveredText;
                            window[wi++] = chunks[ci].Head.CoveredText;
                            if (ci + 1 < chunks.Length) window[wi++] = chunks[ci + 1].Head.CoveredText;
                            if (ci + 2 < chunks.Length) window[wi++] = chunks[ci + 2].Head.CoveredText;
                            // Shrink the window to the number of slots actually filled so the
                            // StringList below does not contain trailing nulls.
                            if (wi < 5) {
                                var subWindow = new string[wi];
                                for (var swi = 0; swi < wi; swi++) {
                                    subWindow[swi] = window[swi];
                                }
                                window = subWindow;
                            }
                            // Record bi- and tri-grams only when the window is large enough.
                            if (window.Length >= 3) {
                                dict.Add(new StringList(window), 2, 3);
                            } else if (window.Length == 2) {
                                dict.Add(new StringList(window), 2, 2);
                            }
                        }
                        ci = reduceStart - 1; //ci will be incremented at end of loop
                    }
                    ci++;
                }
            }
            // Discard n-grams seen fewer than `cutoff` times before materializing.
            dict.CutOff(cutoff, int.MaxValue);
            return dict.ToDictionary(true);
        }