/// <summary>
/// Adds the features for the token at the specified index, given the
/// array of previous outcomes, to the specified list of features.
/// </summary>
/// <param name="features">The list of features to be added to.</param>
/// <param name="tokens">The tokens of the sentence or other text unit being processed.</param>
/// <param name="index">The index of the token which is currently being processed.</param>
/// <param name="previousOutcomes">The outcomes for the tokens prior to the specified index.</param>
public override void CreateFeatures(List<string> features, string[] tokens, int index, string[] previousOutcomes) {
    // Build the character n-grams (of length min..max) for the current token.
    var model = new NGramModel {{ tokens[index], min, max }};

    // Emit one lowercased "ng=" feature per generated n-gram.
    foreach (var token in model) {
        if (token.Count > 0)
            features.Add("ng=" + token[0].ToLowerInvariant());
    }
}
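
// Illustrative usage sketch (not part of the original source). It assumes an
// NGramFeatureGenerator whose constructor takes the minimum and maximum
// n-gram lengths backing the min/max fields used above; the type name and
// constructor signature are assumptions, not confirmed API.
private static void NGramFeatureExample() {
    var generator = new NGramFeatureGenerator(2, 3);   // assumed constructor
    var features = new List<string>();
    var tokens = new[] { "The", "quick", "brown", "fox" };

    // Collect character n-gram features (lengths 2..3) for the token "quick".
    generator.CreateFeatures(features, tokens, 1, new string[0]);

    // features now holds entries such as "ng=qu", "ng=ui" and "ng=qui".
}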
/// <summary>
/// Builds the n-gram dictionary from the given samples.
/// </summary>
/// <param name="samples">The POS samples to read the sentences from.</param>
/// <param name="cutoff">The minimum number of occurrences required for an n-gram to be kept.</param>
/// <returns>The n-gram dictionary.</returns>
public static Dict BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
    var model = new NGramModel();

    // Collect the unigrams of every non-empty sentence in the sample stream.
    POSSample sample;
    while ((sample = samples.Read()) != null) {
        if (sample.Sentence.Length > 0) {
            model.Add(new StringList(sample.Sentence), 1, 1);
        }
    }

    // Discard the n-grams that occur fewer than cutoff times.
    model.CutOff(cutoff, int.MaxValue);

    return model.ToDictionary();
}
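
// Illustrative usage sketch (not part of the original source). OpenSamples()
// is a hypothetical helper standing in for however the POS sample stream is
// created in the application.
private static void BuildNGramDictionaryExample() {
    IObjectStream<POSSample> samples = OpenSamples(); // hypothetical helper

    // Keep only the sentence unigrams that occur at least five times.
    var ngramDict = BuildNGramDictionary(samples, 5);
}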
/// <summary>
/// Creates an n-gram dictionary from the specified data stream using the specified head rules and cut-off.
/// </summary>
/// <param name="data">The data stream of parses.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for an n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();

    Parse p;
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];

        // Add all uni-grams.
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);

        // Add tri-grams and bi-grams for the initial sequence.
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);

        // Emulate reductions to produce additional n-grams.
        var ci = 0;
        while (ci < chunks.Length) {
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                // Perform the reduce: back up to the first sibling with the same parent.
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;

                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;

                if (chunks.Length != 0) {
                    // Collect a window of up to five heads centered on the reduced chunk.
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].Head.CoveredText;
                    if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].Head.CoveredText;
                    window[wi++] = chunks[ci].Head.CoveredText;
                    if (ci + 1 < chunks.Length) window[wi++] = chunks[ci + 1].Head.CoveredText;
                    if (ci + 2 < chunks.Length) window[wi++] = chunks[ci + 2].Head.CoveredText;

                    // Trim the window when fewer than five heads were available.
                    if (wi < 5) {
                        var subWindow = new string[wi];
                        for (var swi = 0; swi < wi; swi++) {
                            subWindow[swi] = window[swi];
                        }
                        window = subWindow;
                    }

                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }

                ci = reduceStart - 1; // ci will be incremented at the end of the loop
            }
            ci++;
        }
    }

    // Discard the n-grams that occur fewer than cutoff times.
    dict.CutOff(cutoff, int.MaxValue);

    return dict.ToDictionary(true);
}
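
// Illustrative usage sketch (not part of the original source). OpenTreebank()
// and LoadHeadRules() are hypothetical helpers standing in for however the
// parse stream and head rules are obtained in the application.
private static void BuildDictionaryExample() {
    IObjectStream<Parse> parses = OpenTreebank();  // hypothetical helper
    AbstractHeadRules rules = LoadHeadRules();     // hypothetical helper

    // With no explicit "dict" cutoff in the training parameters, the method
    // falls back to a cutoff of 5 via parameters.Get("dict", Parameters.Cutoff, 5).
    var dict = BuildDictionary(parses, rules, new TrainingParameters());
}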