/// <summary>
/// Builds the NGram dictionary with the given samples.
/// </summary>
/// <param name="samples">The stream of POS samples to read; consumed until it returns null.</param>
/// <param name="cutoff">The minimum occurrence count an n-gram must reach to be kept.</param>
/// <returns>The NGram dictionary.</returns>
public static Dic BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
    var model = new NGramModel();
    POSSample sample;

    // Feed every non-empty sentence into the model as uni-grams.
    while ((sample = samples.Read()) != null) {
        if (sample.Sentence.Length > 0) {
            model.Add(new StringList(sample.Sentence), 1, 1);
        }
    }

    // Discard n-grams occurring fewer than cutoff times.
    model.CutOff(cutoff, int.MaxValue);

    return model.ToDictionary();
}
/// <summary>
/// Builds the NGram dictionary with the given samples.
/// </summary>
/// <param name="samples">The stream of POS samples to read; consumed until it returns null.</param>
/// <param name="cutoff">The minimum occurrence count an n-gram must reach to be kept.</param>
/// <returns>The NGram dictionary.</returns>
public static Dic BuildNGramDictionary(IObjectStream<POSSample> samples, int cutoff) {
    var model = new NGramModel();
    POSSample sample;

    // Feed every non-empty sentence into the model as uni-grams.
    while ((sample = samples.Read()) != null) {
        if (sample.Sentence.Length > 0) {
            model.Add(new StringList(sample.Sentence), 1, 1);
        }
    }

    // Discard n-grams occurring fewer than cutoff times.
    model.CutOff(cutoff, int.MaxValue);

    return model.ToDictionary();
}
/// <summary>
/// Creates a n-gram dictionary from the specified data stream using the specified head rules and specified cut-off.
/// </summary>
/// <param name="data">The data stream of parses; consumed until it returns null.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();

    Parse p;
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);

        // Add all uni-grams: every tagged token in the parse.
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);

        // Add tri-grams and bi-grams over the head words of the initial chunk sequence.
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);

        // Emulate parser reductions to produce additional n-grams.
        var ci = 0;
        while (ci < chunks.Length) {
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                // Perform the reduce: back up to the first sibling sharing this chunk's parent.
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;

                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;

                if (chunks.Length != 0) {
                    // Collect a window of up to five head words centered on ci.
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0) {
                        window[wi++] = chunks[ci - 2].Head.CoveredText;
                    }
                    if (ci - 1 >= 0) {
                        window[wi++] = chunks[ci - 1].Head.CoveredText;
                    }
                    window[wi++] = chunks[ci].Head.CoveredText;
                    if (ci + 1 < chunks.Length) {
                        window[wi++] = chunks[ci + 1].Head.CoveredText;
                    }
                    if (ci + 2 < chunks.Length) {
                        window[wi++] = chunks[ci + 2].Head.CoveredText;
                    }

                    // Shrink the window to the number of words actually collected.
                    if (wi < 5) {
                        var subWindow = new string[wi];
                        for (var swi = 0; swi < wi; swi++) {
                            subWindow[swi] = window[swi];
                        }
                        window = subWindow;
                    }

                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }

                ci = reduceStart - 1; // ci will be incremented at end of loop
            }
            ci++;
        }
    }

    // Discard n-grams occurring fewer than cutoff times.
    dict.CutOff(cutoff, int.MaxValue);

    return dict.ToDictionary(true);
}
/// <summary>
/// Creates a n-gram dictionary from the specified data stream using the specified head rules and specified cut-off.
/// </summary>
/// <param name="data">The data stream of parses; consumed until it returns null.</param>
/// <param name="rules">The head rules for the parses.</param>
/// <param name="parameters">Can contain a cutoff, the minimum number of entries required for the n-gram to be saved as part of the dictionary.</param>
/// <returns>A dictionary object.</returns>
public static Dic BuildDictionary(IObjectStream<Parse> data, AbstractHeadRules rules, TrainingParameters parameters) {
    var cutoff = parameters.Get("dict", Parameters.Cutoff, 5);
    var dict = new NGramModel();

    Parse p;
    while ((p = data.Read()) != null) {
        p.UpdateHeads(rules);

        // Add all uni-grams: every tagged token in the parse.
        var pWords = p.GetTagNodes();
        var words = new string[pWords.Length];
        for (var wi = 0; wi < words.Length; wi++) {
            words[wi] = pWords[wi].CoveredText;
        }
        dict.Add(new StringList(words), 1, 1);

        // Add tri-grams and bi-grams over the head words of the initial chunk sequence.
        var chunks = CollapsePunctuation(AbstractParserEventStream.GetInitialChunks(p), rules.PunctuationTags);
        var cWords = new string[chunks.Length];
        for (var wi = 0; wi < cWords.Length; wi++) {
            cWords[wi] = chunks[wi].Head.CoveredText;
        }
        dict.Add(new StringList(cWords), 2, 3);

        // Emulate parser reductions to produce additional n-grams.
        var ci = 0;
        while (ci < chunks.Length) {
            if (LastChild(chunks[ci], chunks[ci].Parent, rules.PunctuationTags)) {
                // Perform the reduce: back up to the first sibling sharing this chunk's parent.
                var reduceStart = ci;
                while (reduceStart >= 0 && Equals(chunks[reduceStart].Parent, chunks[ci].Parent)) {
                    reduceStart--;
                }
                reduceStart++;

                chunks = ParserEventStream.ReduceChunks(chunks, ref ci, chunks[ci].Parent);
                ci = reduceStart;

                if (chunks.Length != 0) {
                    // Collect a window of up to five head words centered on ci.
                    var window = new string[5];
                    var wi = 0;
                    if (ci - 2 >= 0) {
                        window[wi++] = chunks[ci - 2].Head.CoveredText;
                    }
                    if (ci - 1 >= 0) {
                        window[wi++] = chunks[ci - 1].Head.CoveredText;
                    }
                    window[wi++] = chunks[ci].Head.CoveredText;
                    if (ci + 1 < chunks.Length) {
                        window[wi++] = chunks[ci + 1].Head.CoveredText;
                    }
                    if (ci + 2 < chunks.Length) {
                        window[wi++] = chunks[ci + 2].Head.CoveredText;
                    }

                    // Shrink the window to the number of words actually collected.
                    if (wi < 5) {
                        var subWindow = new string[wi];
                        for (var swi = 0; swi < wi; swi++) {
                            subWindow[swi] = window[swi];
                        }
                        window = subWindow;
                    }

                    if (window.Length >= 3) {
                        dict.Add(new StringList(window), 2, 3);
                    } else if (window.Length == 2) {
                        dict.Add(new StringList(window), 2, 2);
                    }
                }

                ci = reduceStart - 1; // ci will be incremented at end of loop
            }
            ci++;
        }
    }

    // Discard n-grams occurring fewer than cutoff times.
    dict.CutOff(cutoff, int.MaxValue);

    return dict.ToDictionary(true);
}