/// <summary>
/// Tallies candidate speakers for a quote from the nearby mentions, weighting
/// mentions that precede the quote (backward) differently from those that follow it.
/// </summary>
/// <param name="closestMentions">candidate mentions nearest the quote, both directions</param>
/// <param name="closestMentionsBackward">the mentions that precede the quote; membership selects BackwardWeight</param>
/// <param name="gender">gender constraint for the speaker; Unk means no constraint</param>
/// <param name="quote">the quote being attributed; passed through to pronoun coreference</param>
/// <param name="overrideGender">when true, fall back to the gender-ignoring tally even if a gender was given</param>
/// <returns>
/// the gender-filtered tally when non-empty; otherwise that same (empty) tally when a gender
/// constraint is in force and not overridden (signalling "no match under constraint");
/// otherwise the gender-ignoring tally
/// </returns>
public virtual ICounter<string> GetTopSpeakers(IList<Sieve.MentionData> closestMentions, IList<Sieve.MentionData> closestMentionsBackward, Person.Gender gender, ICoreMap quote, bool overrideGender)
{
    ICounter<string> topSpeakerInRange = new ClassicCounter<string>();
    ICounter<string> topSpeakerInRangeIgnoreGender = new ClassicCounter<string>();
    ICollection<Sieve.MentionData> backwardsMentions = new HashSet<Sieve.MentionData>(closestMentionsBackward);
    foreach (Sieve.MentionData mention in closestMentions)
    {
        // Mentions before the quote get a different weight than those after it.
        double weight = backwardsMentions.Contains(mention) ? BackwardWeight : ForwardWeight;
        if (mention.type.Equals(Name))
        {
            // FIX: was characterMap.Keys.Contains(...) — ContainsKey is the direct lookup.
            if (!characterMap.ContainsKey(mention.text))
            {
                continue;
            }
            Person p = characterMap[mention.text][0];
            if (MatchesRequestedGender(gender, p))
            {
                topSpeakerInRange.IncrementCount(p.name, weight);
            }
            topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
        }
        else if (mention.type.Equals(Pronoun))
        {
            // Resolve the pronoun through coreference before tallying.
            // FIX: removed leftover debug prints that fired only for magic
            // mention-list sizes (128 forward / 94 backward).
            int charBeginKey = doc.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.begin].BeginPosition();
            Person p = DoCoreference(charBeginKey, quote);
            if (p != null)
            {
                if (MatchesRequestedGender(gender, p))
                {
                    topSpeakerInRange.IncrementCount(p.name, weight);
                }
                topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
            }
        }
    }
    if (topSpeakerInRange.Size() > 0)
    {
        return topSpeakerInRange;
    }
    // With a hard gender constraint and no override, deliberately return the empty
    // gender-filtered counter so the caller sees that no matching speaker was found.
    if (gender != Person.Gender.Unk && !overrideGender)
    {
        return topSpeakerInRange;
    }
    return topSpeakerInRangeIgnoreGender;
}

/// <summary>True when the person satisfies the requested gender constraint (Unk accepts anyone).</summary>
private static bool MatchesRequestedGender(Person.Gender gender, Person p)
{
    return (gender == Person.Gender.Male && p.gender == Person.Gender.Male)
        || (gender == Person.Gender.Female && p.gender == Person.Gender.Female)
        || gender == Person.Gender.Unk;
}
/// <summary>
/// Selects up to constVars.numWordsToAdd top-scoring phrases from
/// <paramref name="newdt"/>, skipping phrases with too few non-redundant
/// patterns or fuzzy matches against the ignore set, then logs a preview of
/// the next ten ranked phrases.
/// </summary>
/// <param name="newdt">phrase scores, consumed best-first</param>
/// <param name="terms">patterns per phrase, used for the redundancy check</param>
/// <param name="useThresholdNumPatternsForTheseWords">phrases subject to the pattern-count threshold</param>
/// <param name="ignoreWords">fuzzy-match blacklist; grows when a phrase matches it (may be null)</param>
/// <param name="thresholdWordExtract">minimum score; below this, extraction stops entirely</param>
/// <returns>the chosen phrases with their scores</returns>
public virtual ICounter<CandidatePhrase> ChooseTopWords(ICounter<CandidatePhrase> newdt, TwoDimensionalCounter<CandidatePhrase, E> terms, ICounter<CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection<CandidatePhrase> ignoreWords, double thresholdWordExtract)
{
    // Ranked best-first; note the enumerator is shared with the preview loop below,
    // so it resumes exactly where the selection loop stopped.
    IEnumerator<CandidatePhrase> ranked = Counters.ToPriorityQueue(newdt).GetEnumerator();
    ICounter<CandidatePhrase> chosen = new ClassicCounter<CandidatePhrase>();
    while (ranked.MoveNext())
    {
        if (chosen.Size() >= constVars.numWordsToAdd)
        {
            break;
        }
        CandidatePhrase phrase = ranked.Current;
        // Scores are sorted descending, so one sub-threshold score ends the whole loop.
        if (newdt.GetCount(phrase) < thresholdWordExtract)
        {
            Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + phrase + " and any later words because the score " + newdt.GetCount(phrase) + " is less than the threshold of " + thresholdWordExtract);
            break;
        }
        System.Diagnostics.Debug.Assert((newdt.GetCount(phrase) != double.PositiveInfinity));
        if (useThresholdNumPatternsForTheseWords.ContainsKey(phrase) && NumNonRedundantPatterns(terms, phrase) < constVars.thresholdNumPatternsApplied)
        {
            Redwood.Log("extremePatDebug", "Not adding " + phrase + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(phrase).KeySet());
            continue;
        }
        CandidatePhrase fuzzyMatch = null;
        if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
        {
            fuzzyMatch = ConstantsAndVariables.ContainsFuzzy(ignoreWords, phrase, constVars.minLen4FuzzyForPattern);
        }
        if (fuzzyMatch != null)
        {
            // A near-duplicate of a blacklisted word: record it in the blacklist too.
            Redwood.Log("extremePatDebug", "not adding " + phrase + " because it matched " + fuzzyMatch + " in common English word");
            ignoreWords.Add(phrase);
        }
        else
        {
            Redwood.Log("extremePatDebug", "adding word " + phrase);
            chosen.SetCount(phrase, newdt.GetCount(phrase));
        }
    }
    // Log (up to) the next ten ranked phrases that were not selected, for debugging.
    string preview = string.Empty;
    int shown = 0;
    while (ranked.MoveNext())
    {
        shown++;
        if (shown > 10)
        {
            break;
        }
        CandidatePhrase phrase = ranked.Current;
        preview += ";\t" + phrase + ":" + newdt.GetCount(phrase);
    }
    Redwood.Log(Redwood.Dbg, "Next ten phrases were " + preview);
    return chosen;
}
/// <summary>
/// Builds two surface patterns from identical ingredients and checks that they
/// compare equal and deduplicate both in a counter and in a concurrent index.
/// </summary>
public virtual void TestSimplerTokens()
{
    IDictionary<Type, string> prevContext = new _Dictionary_44();
    IDictionary<Type, string> nextContext = new _Dictionary_49();
    PatternToken firstToken = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern first = new SurfacePattern(CreateContext(prevContext), firstToken, CreateContext(nextContext), SurfacePatternFactory.Genre.Prevnext);
    IDictionary<Type, string> prevContext2 = new _Dictionary_58();
    IDictionary<Type, string> nextContext2 = new _Dictionary_63();
    PatternToken secondToken = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern second = new SurfacePattern(CreateContext(prevContext2), secondToken, CreateContext(nextContext2), SurfacePatternFactory.Genre.Prevnext);
    // Equal ingredients must yield patterns that compare as equal.
    System.Diagnostics.Debug.Assert(first.CompareTo(second) == 0);
    // ...and that collapse to a single entry in a counter...
    ICounter<SurfacePattern> patternCounts = new ClassicCounter<SurfacePattern>();
    patternCounts.SetCount(first, 1);
    patternCounts.SetCount(second, 1);
    System.Diagnostics.Debug.Assert(patternCounts.Size() == 1);
    System.Console.Out.WriteLine("pats size is " + patternCounts.Size());
    // ...and in a concurrent hash index.
    ConcurrentHashIndex<SurfacePattern> patternIndex = new ConcurrentHashIndex<SurfacePattern>();
    patternIndex.Add(first);
    patternIndex.Add(second);
    System.Diagnostics.Debug.Assert(patternIndex.Count == 1);
}
/// <summary>
/// Counts every input symbol across all training paths and returns a
/// Laplace-smoothed distribution over them.
/// </summary>
/// <param name="allTrainPaths">training paths keyed by some identifier; only the values are used</param>
/// <returns>a smoothed prior over input symbols</returns>
protected internal static Distribution<string> ComputeInputPrior(IDictionary<string, IList<IList<string>>> allTrainPaths)
{
    ClassicCounter<string> inputCounts = new ClassicCounter<string>();
    foreach (IList<IList<string>> paths in allTrainPaths.Values)
    {
        foreach (IList<string> path in paths)
        {
            foreach (string symbol in path)
            {
                inputCounts.IncrementCount(symbol);
            }
        }
    }
    // Smoothing uses twice the observed vocabulary size with pseudo-count 0.5.
    return Distribution.LaplaceSmoothedDistribution(inputCounts, inputCounts.Size() * 2, 0.5);
}
/// <summary>
/// Guesses an NER tag for a token span by majority vote over the tokens'
/// individual NER labels, ignoring "O" and missing labels.
/// </summary>
/// <param name="tokens">the sentence's tokens</param>
/// <param name="span">indices (into tokens) of the span to label</param>
/// <returns>the winning NER tag, or "O" when no tag reaches the majority threshold</returns>
public static string GuessNER(IList<CoreLabel> tokens, Span span)
{
    ICounter<string> tagVotes = new ClassicCounter<string>();
    foreach (int idx in span)
    {
        tagVotes.IncrementCount(tokens[idx].Ner());
    }
    // The outside tag and missing tags never win the vote.
    tagVotes.Remove("O");
    tagVotes.Remove(null);
    // NOTE(review): span.Size() / 2 looks like integer division, so the majority
    // threshold rounds down — confirm Size() returns an int.
    if (tagVotes.Size() == 0 || Counters.Max(tagVotes) < span.Size() / 2)
    {
        return "O";
    }
    return Counters.Argmax(tagVotes);
}
/// <summary>
/// Prints corpus statistics for the given trees: character/word token and type
/// counts, singleton-character and singleton-word distributions, and the
/// distribution over word lengths.
/// </summary>
/// <param name="trees">parse trees to tally</param>
/// <param name="pw">destination for the human-readable report</param>
public static void PrintStats(ICollection<Tree> trees, PrintWriter pw)
{
    ClassicCounter<int> wordLengthCounter = new ClassicCounter<int>();
    ClassicCounter<TaggedWord> wordCounter = new ClassicCounter<TaggedWord>();
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();
    int counter = 0;
    foreach (Tree tree in trees)
    {
        counter++;
        IList<TaggedWord> taggedWords = tree.TaggedYield();
        foreach (TaggedWord taggedWord in taggedWords)
        {
            string word = taggedWord.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                // The sentence-boundary marker is not real text.
                continue;
            }
            wordCounter.IncrementCount(taggedWord);
            // FIX: was int.Parse(word.Length) — a Java Integer.valueOf(word.length())
            // conversion artifact that does not compile; count the length directly.
            wordLengthCounter.IncrementCount(word.Length);
            // FIX: the loop bound was an undefined 'length'; iterate the word's characters.
            for (int j = 0; j < word.Length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }
    // Types observed exactly once (counts strictly below 1.5).
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletonChars = Counters.KeysBelow(charCounter, 1.5);
    ICollection<TaggedWord> singletonWords = Counters.KeysBelow(wordCounter, 1.5);
    ClassicCounter<string> singletonWordPOSes = new ClassicCounter<string>();
    foreach (TaggedWord taggedWord_1 in singletonWords)
    {
        singletonWordPOSes.IncrementCount(taggedWord_1.Tag());
    }
    Distribution<string> singletonWordPOSDist = Distribution.GetDistribution(singletonWordPOSes);
    ClassicCounter<char> singletonCharRads = new ClassicCounter<char>();
    foreach (ChineseCharacterBasedLexicon.Symbol s in singletonChars)
    {
        // FIX: was char.ValueOf(...) — a Java Character.valueOf artifact that does not
        // exist in C#; the radical char can be counted directly.
        singletonCharRads.IncrementCount(RadicalMap.GetRadical(s.GetCh()));
    }
    Distribution<char> singletonCharRadDist = Distribution.GetDistribution(singletonCharRads);
    Distribution<int> wordLengthDist = Distribution.GetDistribution(wordLengthCounter);
    NumberFormat percent = new DecimalFormat("##.##%");
    pw.Println("There are " + singletonChars.Count + " singleton chars out of " + (int)charCounter.TotalCount() + " tokens and " + charCounter.Size() + " types found in " + counter + " trees.");
    pw.Println("Thus singletonChars comprise " + percent.Format(singletonChars.Count / charCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonChars.Count / charCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("There are " + singletonWords.Count + " singleton words out of " + (int)wordCounter.TotalCount() + " tokens and " + wordCounter.Size() + " types.");
    pw.Println("Thus singletonWords comprise " + percent.Format(singletonWords.Count / wordCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonWords.Count / wordCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("Distribution over singleton word POS:");
    pw.Println(singletonWordPOSDist.ToString());
    pw.Println();
    pw.Println("Distribution over singleton char radicals:");
    pw.Println(singletonCharRadDist.ToString());
    pw.Println();
    pw.Println("Distribution over word length:");
    pw.Println(wordLengthDist);
}
/// <summary>Debug aid: reports the current entry count of each counter to stdout.</summary>
public virtual void DumpSizes()
{
    // System.out.println("core dep " + coreDependencies.size());
    System.Console.Out.WriteLine($"arg counter {argCounter.Size()}");
    System.Console.Out.WriteLine($"stop counter {stopCounter.Size()}");
}
/// <summary>
/// Return various statistics about the treebank (number of sentences,
/// words, tag set, etc.).
/// </summary>
/// <param name="tlp">
/// The TreebankLanguagePack used to determine punctuation and an
/// appropriate character encoding
/// </param>
/// <returns>A big string for human consumption describing the treebank</returns>
public virtual string TextualSummary(ITreebankLanguagePack tlp)
{
    // Tallies accumulated over a single pass of every tree in this treebank.
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<Tree>();
    ClassicCounter<string> roots = new ClassicCounter<string>();
    ClassicCounter<string> starts = new ClassicCounter<string>();
    ClassicCounter<string> puncts = new ClassicCounter<string>();
    // Counts of several malformed-tree conditions, each kept with one example tree.
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = int.MaxValue;
    int longestSentence = 0;
    int numNullLabel = 0;
    ICollection<string> words = Generics.NewHashSet();
    ClassicCounter<string> tags = new ClassicCounter<string>();
    ClassicCounter<string> cats = new ClassicCounter<string>();
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;
    foreach (Tree t in this)
    {
        roots.IncrementCount(t.Value());
        numTrees++;
        int leng = t.Yield().Count;
        if (leng <= 40)
        {
            numTreesLE40++;
        }
        if (leng < shortestSentence)
        {
            shortestSentence = leng;
        }
        if (leng > longestSentence)
        {
            longestSentence = leng;
        }
        // Classify the root: multi-child roots are flagged; single-child roots are
        // further classified by what the child is (leaf, preterminal, or phrase).
        if (t.NumChildren() > 1)
        {
            if (numNonUnaryRoots == 0)
            {
                nonUnaryEg = t;
            }
            if (numNonUnaryRoots < 100)
            {
                // Only the first 100 local trees are recorded for the report.
                nonUnaries.IncrementCount(t.LocalTree());
            }
            numNonUnaryRoots++;
        }
        else
        {
            if (t.IsLeaf())
            {
                numUnenclosedLeaves++;
            }
            else
            {
                Tree t2 = t.FirstChild();
                if (t2.IsLeaf())
                {
                    numLeaves++;
                    leafEg = t;
                }
                else
                {
                    if (t2.IsPreTerminal())
                    {
                        if (numNonPhrasal == 0)
                        {
                            rootRewritesAsTaggedWordEg = t;
                        }
                        numNonPhrasal++;
                    }
                }
                starts.IncrementCount(t2.Value());
            }
        }
        // Walk every node of the tree, repairing null/empty labels in place and
        // tallying words, tags, categories, and punctuation.
        foreach (Tree subtree in t)
        {
            ILabel lab = subtree.Label();
            if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
            {
                if (numNullLabel == 0)
                {
                    nullLabelEg = subtree;
                }
                numNullLabel++;
                // Repair: give the node an empty-string label so later code can rely on it.
                if (lab == null)
                {
                    subtree.SetLabel(new StringLabel(string.Empty));
                }
                else
                {
                    if (lab.Value() == null)
                    {
                        subtree.Label().SetValue(string.Empty);
                    }
                }
            }
            if (subtree.IsLeaf())
            {
                numWords++;
                words.Add(subtree.Value());
            }
            else
            {
                if (subtree.IsPreTerminal())
                {
                    numTags++;
                    tags.IncrementCount(subtree.Value());
                    if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                    {
                        puncts.IncrementCount(subtree.FirstChild().Value());
                    }
                }
                else
                {
                    if (subtree.IsPhrasal())
                    {
                        // NOTE(review): a phrasal node with a direct leaf child is counted in
                        // numPreTerminalWithMultipleChildren — the variable name looks off for
                        // what is actually being detected; confirm intended semantics.
                        bool hasLeafChild = false;
                        foreach (Tree kt in subtree.Children())
                        {
                            if (kt.IsLeaf())
                            {
                                hasLeafChild = true;
                            }
                        }
                        if (hasLeafChild)
                        {
                            numPreTerminalWithMultipleChildren++;
                            if (preTerminalMultipleChildrenEg == null)
                            {
                                preTerminalMultipleChildrenEg = subtree;
                            }
                        }
                        cats.IncrementCount(subtree.Value());
                    }
                    else
                    {
                        throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
                    }
                }
            }
        }
    }
    // Render the collected statistics into a human-readable report.
    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(0);
    pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
    if (numTrees > 0)
    {
        if (numTags != numWords)
        {
            pw.Println(" Warning! numTags differs and is " + numTags);
        }
        if (roots.Size() == 1)
        {
            string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
            pw.Println(" The root category is: " + root);
        }
        else
        {
            pw.Println(" Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
        }
        if (numNonUnaryRoots > 0)
        {
            // NOTE(review): this string literal spans a physical line (it embeds a raw
            // newline) — preserved byte-for-byte from the converted source.
            pw.Print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. 
");
            if (numNonUnaryRoots > 100)
            {
                pw.Print("First 100 ");
            }
            pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
            pw.Println(" Example: " + nonUnaryEg);
        }
        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
        {
            pw.Println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0)
            {
                pw.Println(" Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0)
            {
                pw.Println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }
        if (numNullLabel > 0)
        {
            pw.Println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.Println(" " + nullLabelEg);
        }
        if (numPreTerminalWithMultipleChildren > 0)
        {
            pw.Println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.Println(" Example: " + preTerminalMultipleChildrenEg);
        }
        pw.Println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
        pw.Println(" " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");
        string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed.... Would need to look
        // for them with a suffix of -[0-9]+
        ICollection<string> knownEmpties = Generics.NewHashSet(Arrays.AsList(empties));
        ICollection<string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
        if (!emptiesIntersection.IsEmpty())
        {
            // NOTE(review): this string literal also embeds a raw newline — preserved as-is.
            pw.Println(" Caution! 
" + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
        }
        ICollection<string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
        if (!joint.IsEmpty())
        {
            pw.Println(" Warning! " + joint.Count + " items are tags and categories: " + joint);
        }
        foreach (string cat in cats.KeySet())
        {
            if (cat != null && cat.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        foreach (string cat_1 in tags.KeySet())
        {
            if (cat_1 != null && cat_1.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat_1);
                break;
            }
        }
        pw.Println(" Cats: " + Counters.ToString(cats, nf));
        pw.Println(" Tags: " + Counters.ToString(tags, nf));
        pw.Println(" " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
        if (!puncts.IsEmpty())
        {
            pw.Println(" Puncts: " + Counters.ToString(puncts, nf));
        }
    }
    return(sw.ToString());
}
/// <summary>
/// Builds the character-based lexicon from the accumulated training sentences:
/// counts characters, identifies singleton characters, accumulates POS-specific
/// character n-grams, and derives smoothed distributions with Dirichlet backoff
/// from shorter contexts.
/// </summary>
public virtual void FinishTraining()
{
    Timing.Tick("Counting characters...");
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();
    // first find all chars that occur only once
    foreach (IList<TaggedWord> labels in trainingSentences)
    {
        foreach (TaggedWord label in labels)
        {
            string word = label.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }
            // FIX: the loop bound was an undefined 'length' (a Java word.length()
            // conversion artifact); iterate over the word's characters.
            for (int j = 0; j < word.Length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }
    // Characters seen exactly once (counts below 1.5).
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletons = Counters.KeysBelow(charCounter, 1.5);
    knownChars = Generics.NewHashSet(charCounter.KeySet());
    Timing.Tick("Counting nGrams...");
    GeneralizedCounter[] POSspecificCharNGrams = new GeneralizedCounter[ContextLength + 1];
    for (int i = 0; i <= ContextLength; i++)
    {
        POSspecificCharNGrams[i] = new GeneralizedCounter(i + 2);
    }
    ClassicCounter<string> POSCounter = new ClassicCounter<string>();
    IList<ISerializable> context = new List<ISerializable>(ContextLength + 1);
    foreach (IList<TaggedWord> words in trainingSentences)
    {
        foreach (TaggedWord taggedWord in words)
        {
            string word = taggedWord.Word();
            string tag = taggedWord.Tag();
            tagIndex.Add(tag);
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }
            POSCounter.IncrementCount(tag);
            // FIX: the bound was an undefined 'size' (Java word.length() artifact):
            // positions 0..word.Length-1 are characters; position word.Length is EndWord,
            // matching the 'i_1 < size' guard before indexing word[i_1] below.
            for (int i_1 = 0; i_1 <= word.Length; i_1++)
            {
                ChineseCharacterBasedLexicon.Symbol sym;
                ChineseCharacterBasedLexicon.Symbol unknownCharClass = null;
                context.Clear();
                context.Add(tag);
                if (i_1 < word.Length)
                {
                    char thisCh = word[i_1];
                    sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(thisCh);
                    if (singletons.Contains(sym))
                    {
                        // Singleton characters also contribute to their unknown-char class.
                        unknownCharClass = UnknownCharClass(sym);
                        charCounter.IncrementCount(unknownCharClass);
                    }
                }
                else
                {
                    sym = ChineseCharacterBasedLexicon.Symbol.EndWord;
                }
                POSspecificCharNGrams[0].IncrementCount(context, sym);
                // POS-specific 1-gram
                if (unknownCharClass != null)
                {
                    // for unknown ch model
                    POSspecificCharNGrams[0].IncrementCount(context, unknownCharClass);
                }
                // context is constructed incrementally:
                // tag prevChar prevPrevChar
                // this could be made faster using .sublist like in score
                for (int j = 1; j <= ContextLength; j++)
                {
                    // poly grams
                    if (i_1 - j < 0)
                    {
                        // Ran off the start of the word: pad with BeginWord and stop extending.
                        context.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            // for unknown ch model
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);
                        }
                        break;
                    }
                    else
                    {
                        ChineseCharacterBasedLexicon.Symbol prev = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[i_1 - j]);
                        if (singletons.Contains(prev))
                        {
                            context.Add(UnknownCharClass(prev));
                        }
                        else
                        {
                            context.Add(prev);
                        }
                        POSspecificCharNGrams[j].IncrementCount(context, sym);
                        if (unknownCharClass != null)
                        {
                            // for unknown ch model
                            POSspecificCharNGrams[j].IncrementCount(context, unknownCharClass);
                        }
                    }
                }
            }
        }
    }
    POSDistribution = Distribution.GetDistribution(POSCounter);
    Timing.Tick("Creating character prior distribution...");
    charDistributions = Generics.NewHashMap();
    // charDistributions = Generics.newHashMap(); // 1.5
    // charCounter.incrementCount(Symbol.UNKNOWN, singletons.size());
    int numberOfKeys = charCounter.Size() + singletons.Count;
    Distribution<ChineseCharacterBasedLexicon.Symbol> prior = Distribution.GoodTuringSmoothedCounter(charCounter, numberOfKeys);
    charDistributions[Java.Util.Collections.EmptyList] = prior;
    for (int i_2 = 0; i_2 <= ContextLength; i_2++)
    {
        ICollection<KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>>> counterEntries = POSspecificCharNGrams[i_2].LowestLevelCounterEntrySet();
        Timing.Tick("Creating " + counterEntries.Count + " character " + (i_2 + 1) + "-gram distributions...");
        foreach (KeyValuePair<IList<ISerializable>, ClassicCounter<ChineseCharacterBasedLexicon.Symbol>> entry in counterEntries)
        {
            context = entry.Key;
            ClassicCounter<ChineseCharacterBasedLexicon.Symbol> c = entry.Value;
            // Back off to the distribution for the one-shorter context as a Dirichlet prior.
            Distribution<ChineseCharacterBasedLexicon.Symbol> thisPrior = charDistributions[context.SubList(0, context.Count - 1)];
            double priorWeight = thisPrior.GetNumberOfKeys() / 200.0;
            Distribution<ChineseCharacterBasedLexicon.Symbol> newDist = Distribution.DynamicCounterWithDirichletPrior(c, thisPrior, priorWeight);
            charDistributions[context] = newDist;
        }
    }
}