/// <summary>
/// Attempt to infer the part of speech of the given preterminal node, which
/// was created during the expansion of a multi-word token.
/// </summary>
/// <param name="t">Preterminal node whose POS tag needs to be inferred.</param>
/// <param name="parent">Parent tree, used to recover the containing phrase string.</param>
/// <param name="unigramTagger">Word-to-tag counts collected from the treebank.</param>
/// <returns>The inferred POS tag for the word under <paramref name="t"/>.</returns>
private static string InferPOS(Tree t, Tree parent, TwoDimensionalCounter<string, string> unigramTagger)
{
    string word = t.FirstChild().Value();
    string containingPhraseStr = GetContainingPhrase(t, parent);
    // Overrides: let the manual POS model handle a few special cases first
    string overrideTag = MultiWordPreprocessor.ManualUWModel.GetOverrideTag(word, containingPhraseStr);
    if (overrideTag != null)
    {
        return overrideTag;
    }
    ICollection<string> unigramTaggerKeys = unigramTagger.FirstKeySet();
    // Try treating this word as a verb and stripping any clitic
    // pronouns. If the stripped version exists in the unigram
    // tagger, then stick with the verb hypothesis.
    SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.SeparatePronouns(word);
    if (strippedVerb != null && unigramTaggerKeys.Contains(strippedVerb.GetStem()))
    {
        string pos = Counters.Argmax(unigramTagger.GetCounter(strippedVerb.GetStem()));
        if (pos.StartsWith("v"))
        {
            return pos;
        }
    }
    // Fix: reuse the key set cached above instead of calling FirstKeySet()
    // a second time (the original re-queried the tagger here).
    if (unigramTaggerKeys.Contains(word))
    {
        // Break ties between equally-frequent tags deterministically.
        return Counters.Argmax(unigramTagger.GetCounter(word), new MultiWordPreprocessor.POSTieBreaker());
    }
    // Unknown word: fall back to the manual unknown-word model.
    return MultiWordPreprocessor.ManualUWModel.GetTag(word, containingPhraseStr);
}
/// <summary>
/// Counts how many of the patterns that matched <paramref name="w"/> are
/// non-redundant, where a pattern is considered redundant if its string form
/// contains, or is contained in, the string form of any later pattern.
/// </summary>
/// <param name="terms">Phrase-to-pattern counts.</param>
/// <param name="w">The candidate phrase whose patterns are examined.</param>
/// <returns>The number of non-redundant patterns for <paramref name="w"/>.</returns>
private double NumNonRedundantPatterns(TwoDimensionalCounter<CandidatePhrase, E> terms, CandidatePhrase w)
{
    object[] patterns = Sharpen.Collections.ToArray(terms.GetCounter(w).KeySet());
    int distinct = 0;
    for (int i = 0; i < patterns.Length; i++)
    {
        string current = patterns[i].ToString();
        bool subsumed = false;
        // Only compare against patterns that come later in the array; an
        // earlier pattern already absorbed any overlap with this one.
        for (int j = i + 1; j < patterns.Length && !subsumed; j++)
        {
            string other = patterns[j].ToString();
            subsumed = other.Contains(current) || current.Contains(other);
        }
        if (!subsumed)
        {
            distinct++;
        }
    }
    return distinct;
}
/// <summary>
/// Tabulates word/tag co-occurrence counts over the treebank and writes one
/// line per word ("word TAB tag TAB count TAB ...") to the given writer.
/// </summary>
/// <param name="tb">Treebank to traverse.</param>
/// <param name="pw">Writer that receives the tab-separated counts.</param>
private static void CountTaggings(Treebank tb, PrintWriter pw)
{
    TwoDimensionalCounter <string, string> wtc = new TwoDimensionalCounter <string, string>();
    // NOTE(review): the tree visitor that should populate wtc has been replaced
    // by null (this looks like a Sharpen lambda-translation artifact), so wtc
    // stays empty and the loop below prints nothing — confirm against the
    // original Java source.
    tb.Apply(null);
    foreach (string key in wtc.FirstKeySet())
    {
        pw.Print(key);
        pw.Print('\t');
        ICounter <string> ctr = wtc.GetCounter(key);
        foreach (string k2 in ctr.KeySet())
        {
            // Each tag for this word, followed by its count, tab-separated.
            pw.Print(k2 + '\t' + ctr.GetCount(k2) + '\t');
        }
        pw.Println();
    }
}
/// <summary>
/// Recursively repairs missing labels in a tree. Preterminals carrying the
/// FrenchXMLTreeReader missing-POS placeholder are retagged from the unigram
/// tagger (falling back to the manual unknown-word model); phrasal nodes with
/// the missing-phrasal placeholder get the most likely category for their
/// children's POS sequence. Increments nMissingPOS / nMissingPhrasal counters.
/// </summary>
/// <param name="t">Tree to fix in place.</param>
/// <param name="pretermLabel">POS-sequence to phrasal-category counts.</param>
/// <param name="unigramTagger">Word to POS-tag counts.</param>
public static void TraverseAndFix(Tree t, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string> unigramTagger)
{
    if (t.IsPreTerminal())
    {
        if (t.Value().Equals(FrenchXMLTreeReader.MissingPos))
        {
            nMissingPOS++;
            string word = t.FirstChild().Value();
            string tag;
            if (unigramTagger.FirstKeySet().Contains(word))
            {
                tag = Counters.Argmax(unigramTagger.GetCounter(word));
            }
            else
            {
                tag = MWEPreprocessor.ManualUWModel.GetTag(word);
            }
            t.SetValue(tag);
        }
        return;
    }
    foreach (Tree child in t.Children())
    {
        TraverseAndFix(child, pretermLabel, unigramTagger);
    }
    // Post-order visit: the children have already been fixed above, so their
    // values form the POS sequence used to look up a phrasal category.
    if (!t.Value().Equals(FrenchXMLTreeReader.MissingPhrasal))
    {
        return;
    }
    nMissingPhrasal++;
    StringBuilder seq = new StringBuilder();
    foreach (Tree child in t.Children())
    {
        seq.Append(child.Value()).Append(" ");
    }
    string posSequence = seq.ToString().Trim();
    if (pretermLabel.FirstKeySet().Contains(posSequence))
    {
        t.SetValue(Counters.Argmax(pretermLabel.GetCounter(posSequence)));
    }
    else
    {
        System.Console.Out.WriteLine("No phrasal cat for: " + posSequence);
    }
}
/// <summary>
/// Dumps a two-dimensional counter to a UTF-8 file, one
/// "key TAB value TAB count" line per entry (counts truncated to int).
/// </summary>
/// <param name="cnt">Counter to dump.</param>
/// <param name="fname">Name of the output file (overwritten if present).</param>
public static void PrintCounter(TwoDimensionalCounter<string, string> cnt, string fname)
{
    try
    {
        PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(fname)), false, "UTF-8"));
        try
        {
            foreach (string key in cnt.FirstKeySet())
            {
                foreach (string val in cnt.GetCounter(key).KeySet())
                {
                    pw.Printf("%s\t%s\t%d%n", key, val, (int)cnt.GetCount(key, val));
                }
            }
        }
        finally
        {
            // Fix: close in a finally block so the underlying stream is
            // released even if printing throws (the original leaked the
            // writer on any failure after construction).
            pw.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Core phrase-learning step for one iteration: applies (or lossily counts)
/// the newly learned patterns, normalizes raw word frequencies, scores
/// candidate phrases, and selects the top words for the given label.
/// Behavior branches on constVars.wordScoring: Weightednorm selects up to
/// numWordsToAdd phrases via the phrase scorer; Bpb selects the single best
/// phrase by maximum pattern weight.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private ICounter<CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICounter<CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection<CandidatePhrase> ignoreWords, bool computeProcDataFreq)
{
    ICollection<CandidatePhrase> alreadyLabeledWords = new HashSet<CandidatePhrase>();
    if (constVars.doNotApplyPatterns)
    {
        // if want to get the stats by the lossy way of just counting without
        // applying the patterns
        ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
        while (sentsIter.MoveNext())
        {
            Pair<IDictionary<string, DataInstance>, File> sentsf = sentsIter.Current;
            this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
        }
    }
    else
    {
        if (patternsLearnedThisIter.Size() > 0)
        {
            // Apply this iteration's patterns to the data, filling
            // wordsPatExtracted / matchedTokensByPat / alreadyLabeledWords.
            this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
        }
    }
    if (computeProcDataFreq)
    {
        if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
        {
            Redwood.Log(Redwood.Dbg, "computing processed freq");
            // Normalize each raw frequency with the configured scheme
            // (sqrt or 1 + log); any other scheme is a configuration error.
            foreach (KeyValuePair<CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
            {
                double @in = fq.Value;
                if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                {
                    @in = Math.Sqrt(@in);
                }
                else
                {
                    if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                    {
                        @in = 1 + Math.Log(@in);
                    }
                    else
                    {
                        throw new Exception("can't understand the normalization");
                    }
                }
                System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                Data.processedDataFreq.SetCount(fq.Key, @in);
            }
        }
        else
        {
            // No normalization requested: alias processed to raw counts.
            Data.processedDataFreq = Data.rawFreq;
        }
    }
    if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
    {
        // Collect candidate phrases that are not known semantic-class words
        // (by surface form or lemma) and were not labeled this iteration.
        foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
        {
            if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
            {
                terms.AddAll(en, wordsPatExtracted.GetCounter(en));
            }
        }
        RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
        ICounter<CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
        System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
        // Build the full ignore set: caller-provided ignore words, other
        // semantic classes, seed dictionary entries, and already-learned words.
        ICollection<CandidatePhrase> ignoreWordsAll;
        if (ignoreWords != null && !ignoreWords.IsEmpty())
        {
            ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
        }
        else
        {
            ignoreWordsAll = new HashSet<CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
        }
        Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
        Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
        System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
        ICounter<CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
        phraseScorer.PrintReasonForChoosing(finalwords);
        scoreForAllWordsThisIteration.Clear();
        Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
        if (constVars.goldEntities != null)
        {
            IDictionary<string, bool> goldEntities4Label = constVars.goldEntities[label];
            if (goldEntities4Label != null)
            {
                StringBuilder s = new StringBuilder();
                // NOTE(review): the consumer that should append gold-label info
                // to s was replaced by null (a Sharpen lambda-translation
                // placeholder?), so s is logged empty — confirm against the
                // original Java.
                finalwords.KeySet().Stream().ForEach(null);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
            }
            else
            {
                Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
            }
        }
        if (constVars.outDir != null && !constVars.outDir.IsEmpty())
        {
            string outputdir = constVars.outDir + "/" + identifier + "/" + label;
            IOUtils.EnsureDir(new File(outputdir));
            // For each selected word, record every word extracted by the same
            // patterns as justification ("reason") words.
            TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<CandidatePhrase, CandidatePhrase>();
            foreach (CandidatePhrase word in finalwords.KeySet())
            {
                foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                {
                    foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                    {
                        reasonForWords.IncrementCount(word, w2);
                    }
                }
            }
            Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
            string filename = outputdir + "/words.json";
            // the json object is an array corresponding to each iteration - of list
            // of objects, each of which is a bean of entity and reasons
            IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
            if (writtenInJustification.Contains(label) && writtenInJustification[label])
            {
                // Previous iterations already wrote to this file: re-read the
                // existing array so this iteration's entries are appended.
                IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                IJsonArray objarr = jsonReader.ReadArray();
                foreach (IJsonValue o in objarr)
                {
                    obj.Add(o);
                }
                jsonReader.Close();
            }
            IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
            foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
            {
                IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder();
                foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                {
                    l.Add(w2.GetPhrase());
                }
                IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                foreach (E p in wordsPatExtracted.GetCounter(w))
                {
                    pats.Add(p.ToStringSimple());
                }
                objinner.Add("reasonwords", l);
                objinner.Add("patterns", pats);
                objinner.Add("score", finalwords.GetCount(w));
                objinner.Add("entity", w.GetPhrase());
                objThisIter.Add(objinner.Build());
            }
            obj.Add(objThisIter);
            // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
            // "Writing justification at " + filename);
            IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
            writtenInJustification[label] = true;
        }
        if (constVars.justify)
        {
            Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
            foreach (CandidatePhrase word in finalwords.KeySet())
            {
                Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
            }
        }
        // if (usePatternResultAsLabel)
        // if (answerLabel != null)
        // labelWords(sents, commonEngWords, finalwords.keySet(),
        // patterns.keySet(), outFile);
        // else
        // throw new RuntimeException("why is the answer label null?");
        return(finalwords);
    }
    else
    {
        if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
        {
            Counters.AddInPlace(terms, wordsPatExtracted);
            // For each term, find the maximum weight among the patterns that
            // matched it this iteration, and remember which pattern that was.
            ICounter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<CandidatePhrase>();
            IDictionary<CandidatePhrase, E> wordMaxPat = new Dictionary<CandidatePhrase, E>();
            foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
            {
                ICounter<E> weights = new ClassicCounter<E>();
                foreach (E k in en.Value.KeySet())
                {
                    weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                }
                maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                wordMaxPat[en.Key] = Counters.Argmax(weights);
            }
            Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
            double maxvalue = Counters.Max(maxPatWeightTerms);
            // Keep every word whose max pattern weight ties the global max
            // (within a small epsilon to tolerate float noise).
            ICollection<CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
            CandidatePhrase bestw = null;
            if (words.Count > 1)
            {
                // Break ties by the term count under the word's best pattern.
                double max = double.NegativeInfinity;
                foreach (CandidatePhrase w in words)
                {
                    if (terms.GetCount(w, wordMaxPat[w]) > max)
                    {
                        max = terms.GetCount(w, wordMaxPat[w]);
                        bestw = w;
                    }
                }
            }
            else
            {
                if (words.Count == 1)
                {
                    // NOTE(review): Current is read before MoveNext() is ever
                    // called on this enumerator; in .NET that is undefined /
                    // throws for most collections. The intent appears to be
                    // "the single element" — confirm and fix upstream.
                    bestw = words.GetEnumerator().Current;
                }
                else
                {
                    // Nothing left after removing already-identified words.
                    return(new ClassicCounter<CandidatePhrase>());
                }
            }
            Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
            return(Counters.AsCounter(Arrays.AsList(bestw)));
        }
        else
        {
            throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
        }
    }
}
/// <summary>
/// Picks up to constVars.numWordsToAdd top-scoring candidate phrases from
/// newdt, skipping phrases below the extraction threshold, phrases with too
/// few non-redundant patterns, and phrases that fuzzily match the ignore set
/// (fuzzy matches are also added to ignoreWords as a side effect). After
/// selection stops, the next ten phrases from the same enumerator are logged
/// for debugging.
/// </summary>
/// <param name="newdt">Phrase scores to select from (highest first).</param>
/// <param name="terms">Phrase-to-pattern counts, used for the redundancy check.</param>
/// <param name="useThresholdNumPatternsForTheseWords">Phrases subject to the min-pattern-count check.</param>
/// <param name="ignoreWords">Words to exclude; mutated when fuzzy matches are found.</param>
/// <param name="thresholdWordExtract">Minimum score for a phrase to be selected.</param>
/// <returns>Counter of the selected phrases with their scores.</returns>
public virtual ICounter<CandidatePhrase> ChooseTopWords(ICounter<CandidatePhrase> newdt, TwoDimensionalCounter<CandidatePhrase, E> terms, ICounter<CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection<CandidatePhrase> ignoreWords, double thresholdWordExtract)
{
    // Priority-queue enumerator yields phrases in descending score order; the
    // same enumerator is deliberately reused by the logging loop below.
    IEnumerator<CandidatePhrase> termIter = Counters.ToPriorityQueue(newdt).GetEnumerator();
    ICounter<CandidatePhrase> finalwords = new ClassicCounter<CandidatePhrase>();
    while (termIter.MoveNext())
    {
        if (finalwords.Size() >= constVars.numWordsToAdd)
        {
            break;
        }
        CandidatePhrase w = termIter.Current;
        if (newdt.GetCount(w) < thresholdWordExtract)
        {
            // Scores are descending, so every later word is also below the
            // threshold — stop entirely rather than continue.
            Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of " + thresholdWordExtract);
            break;
        }
        System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
        if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
        {
            Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
            continue;
        }
        CandidatePhrase matchedFuzzy = null;
        if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
        {
            matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
        }
        if (matchedFuzzy == null)
        {
            Redwood.Log("extremePatDebug", "adding word " + w);
            finalwords.SetCount(w, newdt.GetCount(w));
        }
        else
        {
            // Remember the fuzzy match so the phrase is skipped outright in
            // future iterations.
            Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
            ignoreWords.Add(w);
        }
    }
    // Debug aid: log the next ten phrases the selection loop did not reach.
    string nextTen = string.Empty;
    int n = 0;
    while (termIter.MoveNext())
    {
        n++;
        if (n > 10)
        {
            break;
        }
        CandidatePhrase w = termIter.Current;
        nextTen += ";\t" + w + ":" + newdt.GetCount(w);
    }
    Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
    return(finalwords);
}
/// <summary>
/// Command-line entry point: reads a French treebank file (arg 0), finds all
/// multi-word-expression nodes (labels starting with "MW"), and prints per-label
/// frequency statistics: type counts, singleton counts and percentages, total
/// token count, and the number of unique POS sequences.
/// </summary>
/// <param name="args">Exactly one argument: the treebank file path.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
        System.Environment.Exit(-1);
    }
    File treeFile = new File(args[0]);
    // label -> MWE surface string -> count
    TwoDimensionalCounter <string, string> mweLabelToString = new TwoDimensionalCounter <string, string>();
    ICollection <string> uniquePOSSequences = Generics.NewHashSet();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        // Matches any node whose label starts with "MW" (MWE categories).
        TregexPattern pMWE = TregexPattern.Compile("/^MW/");
        for (Tree t; (t = tr.ReadTree()) != null;)
        {
            //Count MWE statistics
            TregexMatcher m = pMWE.Matcher(t);
            while (m.FindNextMatchingNode())
            {
                Tree match = m.GetMatch();
                string label = match.Value();
                IList <CoreLabel> yield = match.TaggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                foreach (CoreLabel cl in yield)
                {
                    termYield.Append(cl.Word()).Append(" ");
                    posYield.Append(cl.Tag()).Append(" ");
                }
                mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                uniquePOSSequences.Add(posYield.ToString().Trim());
            }
        }
        tr.Close();
        //Closes the underlying reader
        System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.TotalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        foreach (string mweLabel in mweLabelToString.FirstKeySet())
        {
            int nSingletons = 0;
            double totalCount = mweLabelToString.TotalCount(mweLabel);
            ICounter <string> mc = mweLabelToString.GetCounter(mweLabel);
            foreach (string term in mc.KeySet())
            {
                if (mc.GetCount(term) == 1.0)
                {
                    nSingletons++;
                }
                // Token count = words per MWE (whitespace-split) times its
                // frequency. NOTE(review): Split("\\s+") reads like the Java
                // regex overload (presumably a Sharpen extension) — confirm,
                // since .NET string.Split does not take a regex.
                nTokens += term.Split("\\s+").Length *(int)mc.GetCount(term);
            }
            nAllSingletons += nSingletons;
            System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.Console.Out.WriteLine("#tokens = " + nTokens);
        System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}