/// <summary>Method to convert features from counts to L1-normalized TF-IDF based features.</summary>
/// <param name="datum">the datum with a collection of features.</param>
/// <param name="featureDocCounts">a counter of the document count for each feature.</param>
/// <returns>RVFDatum with L1-normalized tf-idf features.</returns>
public virtual RVFDatum<L, F> GetL1NormalizedTFIDFDatum(IDatum<L, F> datum, ICounter<F> featureDocCounts) {
    ICounter<F> tfidfFeatures = new ClassicCounter<F>();
    foreach (F feature in datum.AsFeatures()) {
        if (featureDocCounts.ContainsKey(feature)) {
            tfidfFeatures.IncrementCount(feature, 1.0);
        }
    }
    double l1norm = 0;
    foreach (F feature_1 in tfidfFeatures.KeySet()) {
        double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature_1) + 0.5));
        double tf = tfidfFeatures.GetCount(feature_1);
        tfidfFeatures.SetCount(feature_1, tf * idf);
        l1norm += tf * idf;
    }
    foreach (F feature_2 in tfidfFeatures.KeySet()) {
        double tfidf = tfidfFeatures.GetCount(feature_2);
        tfidfFeatures.SetCount(feature_2, tfidf / l1norm);
    }
    RVFDatum<L, F> rvfDatum = new RVFDatum<L, F>(tfidfFeatures, datum.Label());
    return rvfDatum;
}
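// Hedged worked example of the weighting above (numbers are illustrative, not from the source):
// suppose this.Size() == 9 documents, and a feature occurs in 4 of them and once in the datum.
//   idf = log((9 + 1) / (4 + 0.5)) = log(2.222...) ≈ 0.80
//   unnormalized value = tf * idf = 1.0 * 0.80
// Each value is then divided by the L1 norm (the sum of all unnormalized values for the datum),
// so the resulting feature values sum to 1.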
public virtual void FinishTraining() {
    // testing: get some stats here
    log.Info("Total tokens: " + tokens);
    log.Info("Total WordTag types: " + wtCount.KeySet().Count);
    log.Info("Total tag types: " + tagCount.KeySet().Count);
    log.Info("Total word types: " + seenWords.Count);
    /* find # of once-seen words for each tag */
    foreach (Pair<string, string> wt in wtCount.KeySet()) {
        if (wtCount.GetCount(wt) == 1) {
            r1.IncrementCount(wt.Second());
        }
    }
    /* find # of unseen words for each tag */
    foreach (string tag in tagCount.KeySet()) {
        foreach (string word in seenWords) {
            Pair<string, string> wt_1 = new Pair<string, string>(word, tag);
            if (!(wtCount.KeySet().Contains(wt_1))) {
                r0.IncrementCount(tag);
            }
        }
    }
    /* set unseen word probability for each tag */
    foreach (string tag_1 in tagCount.KeySet()) {
        float logprob = (float)Math.Log(r1.GetCount(tag_1) / (tagCount.GetCount(tag_1) * r0.GetCount(tag_1)));
        unknownGT[tag_1] = logprob;
    }
}
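// The log probability assigned above is a Good-Turing style estimate for unseen words:
//   log( r1(tag) / (count(tag) * r0(tag)) )
// where r1(tag) is the number of word/tag pairs seen exactly once with that tag, and r0(tag)
// is the number of seen word types never observed with that tag. Illustrative numbers only:
// with r1 = 3, count(tag) = 100, r0 = 50, the stored value is log(3 / 5000) ≈ -7.42.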
protected internal virtual void InitRulesWithWord() {
    if (testOptions.verbose || DebugLexicon) {
        log.Info("Initializing lexicon scores ... ");
    }
    // int numWords = words.size()+sigs.size()+1;
    int unkWord = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
    int numWords = wordIndex.Size();
    rulesWithWord = new IList[numWords];
    for (int w = 0; w < numWords; w++) {
        rulesWithWord[w] = new List<IntTaggedWord>(1);
    }
    // most have 1 or 2 items in them
    // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
    tags = Generics.NewHashSet();
    foreach (IntTaggedWord iTW in seenCounter.KeySet()) {
        if (iTW.Word() == nullWord && iTW.Tag() != nullTag) {
            tags.Add(iTW);
        }
    }
    // tags for unknown words
    foreach (IntTaggedWord iT in tags) {
        double types = uwModel.UnSeenCounter().GetCount(iT);
        if (types > trainOptions.openClassTypesThreshold) {
            // Number of types before it's treated as open class
            IntTaggedWord iTW_1 = new IntTaggedWord(unkWord, iT.tag);
            rulesWithWord[iTW_1.word].Add(iTW_1);
        }
    }
    if (testOptions.verbose || DebugLexicon) {
        StringBuilder sb = new StringBuilder();
        sb.Append("The ").Append(rulesWithWord[unkWord].Count).Append(" open class tags are: [");
        foreach (IntTaggedWord item in rulesWithWord[unkWord]) {
            sb.Append(' ').Append(tagIndex.Get(item.Tag()));
        }
        sb.Append(" ]");
        log.Info(sb.ToString());
    }
    foreach (IntTaggedWord iTW_2 in seenCounter.KeySet()) {
        if (iTW_2.Tag() != nullTag && iTW_2.Word() != nullWord) {
            rulesWithWord[iTW_2.word].Add(iTW_2);
        }
    }
}
/// <summary>Sorts the counter by feature keys and dumps it in SVM Light format.</summary>
public static void PrintSVMLightFormat(PrintWriter pw, ClassicCounter<int> c, int classNo) {
    int[] features = Sharpen.Collections.ToArray(c.KeySet(), new int[c.KeySet().Count]);
    Arrays.Sort(features);
    StringBuilder sb = new StringBuilder();
    sb.Append(classNo);
    sb.Append(' ');
    foreach (int f in features) {
        sb.Append(f + 1).Append(':').Append(c.GetCount(f)).Append(' ');
    }
    pw.Println(sb.ToString());
}
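// Example with hypothetical values: for classNo == 2 and counts { 3 -> 1.0, 1 -> 2.0 },
// the printed line is "2 2:2.0 4:1.0 " since feature ids are sorted and shifted by +1,
// matching the 1-based feature indexing expected by svm_light.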
public override IUnknownWordModel FinishTraining() {
    if (useGT) {
        unknownGTTrainer.FinishTraining();
    }
    foreach (KeyValuePair<ILabel, ClassicCounter<string>> entry in c) {
        /* outer iteration is over tags */
        ILabel key = entry.Key;
        ClassicCounter<string> wc = entry.Value;  // counts for words given a tag
        if (!tagHash.Contains(key)) {
            tagHash[key] = new ClassicCounter<string>();
        }
        /* the UNKNOWN sequence is assumed to be seen once in each tag */
        // This is sort of broken, but you can regard it as a Dirichlet prior.
        tc.IncrementCount(key);
        wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
        /* inner iteration is over words */
        foreach (string end in wc.KeySet()) {
            double prob = Math.Log((wc.GetCount(end)) / (tc.GetCount(key)));  // p(sig|tag)
            tagHash[key].SetCount(end, prob);
        }
    }
    //if (Test.verbose)
    //EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding);
    return model;
}
/// <exception cref="System.IO.IOException"/>
private void WriteObject(ObjectOutputStream stream) {
    // log.info("\nBefore compression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    ClassicCounter<IntDependency> fullArgCounter = argCounter;
    argCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency dependency in fullArgCounter.KeySet()) {
        if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1) {
            argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency));
        }
    }
    ClassicCounter<IntDependency> fullStopCounter = stopCounter;
    stopCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency dependency_1 in fullStopCounter.KeySet()) {
        if (dependency_1.head.word != -1) {
            stopCounter.IncrementCount(dependency_1, fullStopCounter.GetCount(dependency_1));
        }
    }
    // log.info("After compression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    stream.DefaultWriteObject();
    argCounter = fullArgCounter;
    stopCounter = fullStopCounter;
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private void ReadObject(ObjectInputStream stream) {
    stream.DefaultReadObject();
    // log.info("Before decompression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    ClassicCounter<IntDependency> compressedArgC = argCounter;
    argCounter = new ClassicCounter<IntDependency>();
    ClassicCounter<IntDependency> compressedStopC = stopCounter;
    stopCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency d in compressedArgC.KeySet()) {
        double count = compressedArgC.GetCount(d);
        ExpandArg(d, d.distance, count);
    }
    foreach (IntDependency d_1 in compressedStopC.KeySet()) {
        double count = compressedStopC.GetCount(d_1);
        ExpandStop(d_1, d_1.distance, count, false);
    }
    // log.info("After decompression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    expandDependencyMap = null;
}
public override IUnknownWordModel FinishTraining() {
    // Map<String,Float> unknownGT = null;
    if (useGT) {
        unknownGTTrainer.FinishTraining();
    }
    // unknownGT = unknownGTTrainer.unknownGT;
    foreach (ILabel tagLab in c.Keys) {
        // outer iteration is over tags as Labels
        ClassicCounter<string> wc = c[tagLab];  // counts for words given a tag
        if (!tagHash.Contains(tagLab)) {
            tagHash[tagLab] = new ClassicCounter<string>();
        }
        // the UNKNOWN first character is assumed to be seen once in each tag
        // this is really sort of broken! (why??)
        tc.IncrementCount(tagLab);
        wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
        // inner iteration is over words as strings
        foreach (string first in wc.KeySet()) {
            double prob = Math.Log(((wc.GetCount(first))) / tc.GetCount(tagLab));
            tagHash[tagLab].SetCount(first, prob);
        }
    }
    //if (Test.verbose)
    //EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding);
    return model;
}
public virtual void ClassifyMentions(IList<IList<Mention>> predictedMentions, Dictionaries dict, Properties props) {
    ICollection<string> neStrings = Generics.NewHashSet();
    foreach (IList<Mention> predictedMention in predictedMentions) {
        foreach (Mention m in predictedMention) {
            string ne = m.headWord.Ner();
            if (ne.Equals("O")) {
                continue;
            }
            foreach (CoreLabel cl in m.originalSpan) {
                if (!cl.Ner().Equals(ne)) {
                    continue;
                }
            }
            neStrings.Add(m.LowercaseNormalizedSpanString());
        }
    }
    foreach (IList<Mention> predicts in predictedMentions) {
        IDictionary<int, ICollection<Mention>> headPositions = Generics.NewHashMap();
        foreach (Mention p in predicts) {
            if (!headPositions.Contains(p.headIndex)) {
                headPositions[p.headIndex] = Generics.NewHashSet();
            }
            headPositions[p.headIndex].Add(p);
        }
        ICollection<Mention> remove = Generics.NewHashSet();
        foreach (int hPos in headPositions.Keys) {
            ICollection<Mention> shares = headPositions[hPos];
            if (shares.Count > 1) {
                ICounter<Mention> probs = new ClassicCounter<Mention>();
                foreach (Mention p_1 in shares) {
                    double trueProb = ProbabilityOf(p_1, shares, neStrings, dict, props);
                    probs.IncrementCount(p_1, trueProb);
                }
                // add to remove
                Mention keep = Counters.Argmax(probs, null);
                probs.Remove(keep);
                Sharpen.Collections.AddAll(remove, probs.KeySet());
            }
        }
        foreach (Mention r in remove) {
            predicts.Remove(r);
        }
    }
}
/// <summary>Returns a list of all modes in the Collection.</summary>
/// <remarks>
/// Returns a list of all modes in the Collection. (If the Collection has multiple items with the
/// highest frequency, all of them will be returned.)
/// </remarks>
public static ICollection<T> Modes<T>(ICollection<T> values) {
    ICounter<T> counter = new ClassicCounter<T>(values);
    IList<double> sortedCounts = Edu.Stanford.Nlp.Util.CollectionUtils.Sorted(counter.Values());
    double highestCount = sortedCounts[sortedCounts.Count - 1];
    Counters.RetainAbove(counter, highestCount);
    return counter.KeySet();
}
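// Example: Modes(Arrays.AsList("a", "b", "b", "c", "c")) returns { "b", "c" },
// since "b" and "c" are tied for the highest frequency (2) and both survive RetainAbove.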
private static void Display<T>(ClassicCounter<T> c, PrintWriter pw) {
    IList<T> cats = new List<T>(c.KeySet());
    cats.Sort(Counters.ToComparatorDescending(c));
    foreach (T ob in cats) {
        pw.Println(ob + " " + c.GetCount(ob));
    }
}
public virtual void RunCoref(Document document) {
    Compressor<string> compressor = new Compressor<string>();
    if (Thread.Interrupted()) {
        // Allow interrupting
        throw new RuntimeInterruptedException();
    }
    IDictionary<Pair<int, int>, bool> pairs = new Dictionary<Pair<int, int>, bool>();
    foreach (KeyValuePair<int, IList<int>> e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch)) {
        foreach (int m1 in e.Value) {
            pairs[new Pair<int, int>(m1, e.Key)] = true;
        }
    }
    DocumentExamples examples = extractor.Extract(0, document, pairs, compressor);
    ICounter<Pair<int, int>> pairwiseScores = new ClassicCounter<Pair<int, int>>();
    foreach (Example mentionPair in examples.examples) {
        if (Thread.Interrupted()) {
            // Allow interrupting
            throw new RuntimeInterruptedException();
        }
        pairwiseScores.IncrementCount(new Pair<int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor));
    }
    IList<Pair<int, int>> mentionPairs = new List<Pair<int, int>>(pairwiseScores.KeySet());
    mentionPairs.Sort(null);
    ICollection<int> seenAnaphors = new HashSet<int>();
    foreach (Pair<int, int> pair in mentionPairs) {
        if (seenAnaphors.Contains(pair.second)) {
            continue;
        }
        if (Thread.Interrupted()) {
            // Allow interrupting
            throw new RuntimeInterruptedException();
        }
        seenAnaphors.Add(pair.second);
        Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType;
        Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType;
        if (pairwiseScores.GetCount(pair) > thresholds[new Pair<bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)]) {
            CorefUtils.MergeCoreferenceClusters(pair, document);
        }
    }
}
public override IDependencyGrammar FormResult() {
    wordIndex.AddToIndex(LexiconConstants.UnknownWord);
    MLEDependencyGrammar dg = new MLEDependencyGrammar(tlpParams, directional, useDistance, useCoarseDistance, basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
    foreach (IntDependency dependency in dependencyCounter.KeySet()) {
        dg.AddRule(dependency, dependencyCounter.GetCount(dependency));
    }
    return dg;
}
public virtual void DumpStats() {
    System.Console.Out.WriteLine("%% Counts of nonterminals:");
    IList<string> biggestCounts = new List<string>(nonTerms.KeySet());
    biggestCounts.Sort(Counters.ToComparatorDescending(nonTerms));
    foreach (string str in biggestCounts) {
        System.Console.Out.WriteLine(str + ": " + nonTerms.GetCount(str));
    }
}
public static TransducerGraph CreateGraphFromPaths<T>(ClassicCounter<IList<T>> pathCounter, int markovOrder) {
    TransducerGraph graph = new TransducerGraph();  // empty
    foreach (IList<T> path in pathCounter.KeySet()) {
        double count = pathCounter.GetCount(path);
        AddOnePathToGraph(path, count, markovOrder, graph);
    }
    return graph;
}
/// <summary>
/// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
/// using the actual features and labels.
/// </summary>
/// <remarks>
/// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
/// using the actual features and labels. Because this is svm_light, and not svm_struct, the
/// weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class
/// (which correspond to labelIndex.get(1)) are just the negation of one another.
/// </remarks>
private ClassicCounter<Pair<F, L>> ConvertSVMLightWeights(ClassicCounter<int> weights, IIndex<F> featureIndex, IIndex<L> labelIndex) {
    ClassicCounter<Pair<F, L>> newWeights = new ClassicCounter<Pair<F, L>>();
    foreach (int i in weights.KeySet()) {
        F f = featureIndex.Get(i - 1);
        double w = weights.GetCount(i);
        // the first guy in the labelIndex was the +1 class and the second guy was the -1 class
        newWeights.IncrementCount(new Pair<F, L>(f, labelIndex.Get(0)), w);
        newWeights.IncrementCount(new Pair<F, L>(f, labelIndex.Get(1)), -w);
    }
    return newWeights;
}
/// <summary>
/// Converts the svm_struct weight Counter (in which the weight for a feature/label pair
/// corresponds to ((labelIndex * numFeatures) + (featureIndex + 1))) into a weight Counter
/// using the actual features and labels.
/// </summary>
private ClassicCounter<Pair<F, L>> ConvertSVMStructWeights(ClassicCounter<int> weights, IIndex<F> featureIndex, IIndex<L> labelIndex) {
    // int numLabels = labelIndex.size();
    int numFeatures = featureIndex.Size();
    ClassicCounter<Pair<F, L>> newWeights = new ClassicCounter<Pair<F, L>>();
    foreach (int i in weights.KeySet()) {
        L l = labelIndex.Get((i - 1) / numFeatures);  // integer division on purpose
        F f = featureIndex.Get((i - 1) % numFeatures);
        double w = weights.GetCount(i);
        newWeights.IncrementCount(new Pair<F, L>(f, l), w);
    }
    return newWeights;
}
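// Hedged worked example of the index arithmetic above (numbers are illustrative):
// with numFeatures == 3, a raw svm_struct weight index i == 5 maps to
//   label index   (5 - 1) / 3 == 1   and   feature index (5 - 1) % 3 == 1,
// i.e. the weight is assigned to the pair (featureIndex.Get(1), labelIndex.Get(1)),
// which is consistent with 5 == (1 * 3) + (1 + 1).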
private static void Display<T>(ClassicCounter<T> c, int num, PrintWriter pw) {
    IList<T> rules = new List<T>(c.KeySet());
    rules.Sort(Counters.ToComparatorDescending(c));
    int rSize = rules.Count;
    if (num > rSize) {
        num = rSize;
    }
    for (int i = 0; i < num; i++) {
        pw.Println(rules[i] + " " + c.GetCount(rules[i]));
    }
}
// todo: Fix javadoc, have unit tested
/// <summary>Print SVM Light Format file.</summary>
/// <remarks>
/// Print SVM Light Format file.
/// The following comments are no longer applicable because I am
/// now printing out the exact labelID for each example. -Ramesh ([email protected]) 12/17/2009.
/// If the Dataset has more than 2 classes, then it
/// prints using the label index (+1) (for svm_struct). If it is 2 classes, then the labelIndex.get(0)
/// is mapped to +1 and labelIndex.get(1) is mapped to -1 (for svm_light).
/// </remarks>
public virtual void PrintSVMLightFormat(PrintWriter pw) {
    // assumes each data item has a few features on, and sorts the feature keys while collecting the values in a counter
    // old comment:
    // the following code commented out by Ramesh ([email protected]) 12/17/2009.
    // why not simply print the exact id of the label instead of mapping to some values??
    // new comment:
    // mihai: we NEED this, because svm_light has special conventions not supported by default by our labels,
    // e.g., in a multiclass setting it assumes that labels start at 1 whereas our labels start at 0 (08/31/2010)
    string[] labelMap = MakeSvmLabelMap();
    for (int i = 0; i < size; i++) {
        RVFDatum<L, F> d = GetRVFDatum(i);
        ICounter<F> c = d.AsFeaturesCounter();
        ClassicCounter<int> printC = new ClassicCounter<int>();
        foreach (F f in c.KeySet()) {
            printC.SetCount(featureIndex.IndexOf(f), c.GetCount(f));
        }
        int[] features = Sharpen.Collections.ToArray(printC.KeySet(), new int[printC.KeySet().Count]);
        Arrays.Sort(features);
        StringBuilder sb = new StringBuilder();
        sb.Append(labelMap[labels[i]]).Append(' ');
        // sb.append(labels[i]).append(' ');
        // commented out by mihai: labels[i] breaks svm_light conventions!
        /* Old code: assumes that F is Integer....
         *
         * for (int f: features) {
         *   sb.append((f + 1)).append(":").append(c.getCount(f)).append(" ");
         * }
         */
        // I think this is what was meant (using printC rather than c), but not sure
        // ~Sarah Spikes ([email protected])
        foreach (int f_1 in features) {
            sb.Append((f_1 + 1)).Append(':').Append(printC.GetCount(f_1)).Append(' ');
        }
        pw.Println(sb.ToString());
    }
}
private static ICounter<string> GetFeatures(ClustererDataLoader.ClustererDoc doc, IList<Pair<int, int>> mentionPairs, ICounter<Pair<int, int>> scores) {
    ICounter<string> features = new ClassicCounter<string>();
    double maxScore = 0;
    double minScore = 1;
    ICounter<string> totals = new ClassicCounter<string>();
    ICounter<string> totalsLog = new ClassicCounter<string>();
    ICounter<string> counts = new ClassicCounter<string>();
    foreach (Pair<int, int> pair in mentionPairs) {
        // fall back to the reversed pair when the scores counter stores the opposite orientation
        Pair<int, int> mentionPair = scores.ContainsKey(pair) ? pair : new Pair<int, int>(pair.second, pair.first);
        double score = scores.GetCount(mentionPair);
        double logScore = CappedLog(score);
        string mt1 = doc.mentionTypes[mentionPair.first];
        string mt2 = doc.mentionTypes[mentionPair.second];
        mt1 = mt1.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
        mt2 = mt2.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
        string conj = "_" + mt1 + "_" + mt2;
        maxScore = Math.Max(maxScore, score);
        minScore = Math.Min(minScore, score);
        totals.IncrementCount(string.Empty, score);
        totalsLog.IncrementCount(string.Empty, logScore);
        counts.IncrementCount(string.Empty);
        totals.IncrementCount(conj, score);
        totalsLog.IncrementCount(conj, logScore);
        counts.IncrementCount(conj);
    }
    features.IncrementCount("max", maxScore);
    features.IncrementCount("min", minScore);
    foreach (string key in counts.KeySet()) {
        features.IncrementCount("avg" + key, totals.GetCount(key) / mentionPairs.Count);
        features.IncrementCount("avgLog" + key, totalsLog.GetCount(key) / mentionPairs.Count);
    }
    return features;
}
/// <summary>Writes out data from this Object to the Writer w.</summary>
/// <exception cref="System.IO.IOException"/>
public override void WriteData(PrintWriter @out) {
    // all lines have one rule per line
    foreach (IntDependency dependency in argCounter.KeySet()) {
        if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1) {
            double count = argCounter.GetCount(dependency);
            @out.Println(dependency.ToString(wordIndex, tagIndex) + " " + count);
        }
    }
    @out.Println("BEGIN_STOP");
    foreach (IntDependency dependency_1 in stopCounter.KeySet()) {
        if (dependency_1.head.word != -1) {
            double count = stopCounter.GetCount(dependency_1);
            @out.Println(dependency_1.ToString(wordIndex, tagIndex) + " " + count);
        }
    }
    @out.Flush();
}
// private static String stripTag(String tag) {
//   if (tag.startsWith("DT")) {
//     String newTag = tag.substring(2, tag.length());
//     return newTag.length() > 0 ? newTag : tag;
//   }
//   return tag;
// }

/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length != 3) {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    if (language.Equals(Language.Arabic)) {
        string[] options = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(options, 0);
    } else {
        string[] options = new string[] { "-frenchFactored" };
        tlpp.SetOptionFlag(options, 0);
    }
    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);
    MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();
    string[] features = args[2].Trim().Split(",");
    foreach (string feature in features) {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    // Counters
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);
    int numTrees = 0;
    foreach (Tree tree in tb) {
        foreach (Tree subTree in tree) {
            if (!subTree.IsLeaf()) {
                tlpp.TransformTree(subTree, tree);
            }
        }
        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i) {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();
            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();
            // WSGDEBUG
            if (tag.Contains("MW")) {
                lemma += "-MWE";
            }
            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);
            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
            if (wordLemmaMap.Contains(word)) {
                wordLemmaMap[word].Add(lemma);
            } else {
                ICollection<string> lemmas = Generics.NewHashSet(1);
                wordLemmaMap[word] = lemmas;
            }
            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }
    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap) {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0) {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1) {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }
        // lemmas has exactly one element here
        IEnumerator<string> lemmaItr = lemmas.GetEnumerator();
        lemmaItr.MoveNext();
        string lemma = lemmaItr.Current;
        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1) {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags) {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }
    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");
    IList<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort();
    foreach (string tag_1 in tags) {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags) {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.Length; i++) {
        if (args[i].StartsWith("-")) {
            switch (args[i]) {
                case "-l": {
                    lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }
                case "-e": {
                    encoding = args[++i];
                    break;
                }
                default: {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        } else {
            if (tb == null) {
                if (tlpp == null) {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                } else {
                    tlpp.SetInputEncoding(encoding);
                    tlpp.SetOutputEncoding(encoding);
                    tb = tlpp.DiskTreebank();
                }
            }
            tb.LoadPath(args[i]);
        }
    }
    PrintWriter pw = tlpp.Pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if (lang == Language.French) {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    } else {
        if (lang == Language.Arabic) {
            lexOptions.smartMutation = false;
            lexOptions.useUnknownWordSignatures = 9;
            lexOptions.unknownPrefixSize = 1;
            lexOptions.unknownSuffixSize = 1;
        }
    }
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    ILexicon lex = tlpp.Lex(op, wordIndex, tagIndex);
    int computeAfter = (int)(0.50 * tb.Count);
    ICounter<string> vocab = new ClassicCounter<string>();
    ICounter<string> unkCounter = new ClassicCounter<string>();
    int treeId = 0;
    foreach (Tree t in tb) {
        IList<ILabel> yield = t.Yield();
        int posId = 0;
        foreach (ILabel word in yield) {
            vocab.IncrementCount(word.Value());
            if (treeId > computeAfter && vocab.GetCount(word.Value()) < 2.0) {
                // if(lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
                //   pw.println(word.value());
                unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(word.Value(), posId++));
            }
        }
        treeId++;
    }
    IList<string> biggestKeys = new List<string>(unkCounter.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
    foreach (string wordType in biggestKeys) {
        pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
    }
    pw.Close();
}
/// <summary>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses.
/// </summary>
/// <remarks>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses. In other words, the evaluation file has the
/// same format as the training data.
/// </remarks>
/// <param name="pwOut"/>
private void Evaluate(PrintWriter pwOut) {
    log.Info("Starting evaluation...");
    bool hasSegmentationMarkers = true;
    bool hasTags = true;
    IDocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
    ObjectBank<IList<CoreLabel>> lines = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
    PrintWriter tedEvalGoldTree = null;
    PrintWriter tedEvalParseTree = null;
    PrintWriter tedEvalGoldSeg = null;
    PrintWriter tedEvalParseSeg = null;
    if (tedEvalPrefix != null) {
        try {
            tedEvalGoldTree = new PrintWriter(tedEvalPrefix + "_gold.ftree");
            tedEvalGoldSeg = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
            tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
            tedEvalParseSeg = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
        } catch (FileNotFoundException e) {
            System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
        }
    }
    ICounter<string> labelTotal = new ClassicCounter<string>();
    ICounter<string> labelCorrect = new ClassicCounter<string>();
    int total = 0;
    int correct = 0;
    foreach (IList<CoreLabel> goldLine in lines) {
        string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(goldLine).ReplaceAll(":", "#pm#")).Split(" ");
        string[] goldTokens = TedEvalSanitize(IOBUtils.IOBToString(goldLine, ":")).Split(" ");
        IList<CoreLabel> line = classifier.Classify(goldLine);
        string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        foreach (CoreLabel label in line) {
            // Do not evaluate labeling of whitespace
            string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
            if (!observation.Equals(IOBUtils.GetBoundaryCharacter())) {
                total++;
                string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                string reference = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                labelTotal.IncrementCount(reference);
                if (hypothesis.Equals(reference)) {
                    correct++;
                    labelCorrect.IncrementCount(reference);
                }
            }
        }
        if (tedEvalParseSeg != null) {
            tedEvalGoldTree.Printf("(root");
            tedEvalParseTree.Printf("(root");
            int safeLength = inputTokens.Length;
            if (inputTokens.Length != goldTokens.Length) {
                log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" gold: " + Arrays.ToString(goldTokens));
                safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
            }
            if (inputTokens.Length != parseTokens.Length) {
                log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" parse: " + Arrays.ToString(parseTokens));
                safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
            }
            for (int i = 0; i < safeLength; i++) {
                foreach (string segment in goldTokens[i].Split(":")) {
                    tedEvalGoldTree.Printf(" (seg %s)", segment);
                }
                tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                foreach (string segment_1 in parseTokens[i].Split(":")) {
                    tedEvalParseTree.Printf(" (seg %s)", segment_1);
                }
                tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
            }
            tedEvalGoldTree.Printf(")%n");
            tedEvalGoldSeg.Println();
            tedEvalParseTree.Printf(")%n");
            tedEvalParseSeg.Println();
        }
    }
    double accuracy = ((double)correct) / ((double)total);
    accuracy *= 100.0;
    pwOut.Println("EVALUATION RESULTS");
    pwOut.Printf("#datums:\t%d%n", total);
    pwOut.Printf("#correct:\t%d%n", correct);
    pwOut.Printf("accuracy:\t%.2f%n", accuracy);
    pwOut.Println("==================");
    // Output the per label accuracies
    pwOut.Println("PER LABEL ACCURACIES");
    foreach (string refLabel in labelTotal.KeySet()) {
        double nTotal = labelTotal.GetCount(refLabel);
        double nCorrect = labelCorrect.GetCount(refLabel);
        double acc = (nCorrect / nTotal) * 100.0;
        pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
    }
    if (tedEvalParseSeg != null) {
        tedEvalGoldTree.Close();
        tedEvalGoldSeg.Close();
        tedEvalParseTree.Close();
        tedEvalParseSeg.Close();
    }
}
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.Length; i++) {
        if (args[i].StartsWith("-")) {
            switch (args[i]) {
                case "-l": {
                    Language lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }
                case "-e": {
                    encoding = args[++i];
                    break;
                }
                default: {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        } else {
            rootMatch = TregexPattern.Compile("@" + args[i++]);
            if (tb == null) {
                if (tlpp == null) {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                } else {
                    tlpp.SetInputEncoding(encoding);
                    tlpp.SetOutputEncoding(encoding);
                    tb = tlpp.DiskTreebank();
                }
            }
            tb.LoadPath(args[i++]);
        }
    }
    ICounter<string> rhsCounter = new ClassicCounter<string>();
    foreach (Tree t in tb) {
        TregexMatcher m = rootMatch.Matcher(t);
        while (m.FindNextMatchingNode()) {
            Tree match = m.GetMatch();
            StringBuilder sb = new StringBuilder();
            foreach (Tree kid in match.Children()) {
                sb.Append(kid.Value()).Append(" ");
            }
            rhsCounter.IncrementCount(sb.ToString().Trim());
        }
    }
    IList<string> biggestKeys = new List<string>(rhsCounter.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(rhsCounter));
    PrintWriter pw = tlpp.Pw();
    foreach (string rhs in biggestKeys) {
        pw.Printf("%s\t%d%n", rhs, (int)rhsCounter.GetCount(rhs));
    }
    pw.Close();
}
public override void PrintResults(PrintWriter pw, IList<ICoreMap> goldStandard, IList<ICoreMap> extractorOutput) {
    ResultsPrinter.Align(goldStandard, extractorOutput);
    ICounter<string> correct = new ClassicCounter<string>();
    ICounter<string> predicted = new ClassicCounter<string>();
    ICounter<string> gold = new ClassicCounter<string>();
    for (int i = 0; i < goldStandard.Count; i++) {
        ICoreMap goldSent = goldStandard[i];
        ICoreMap sysSent = extractorOutput[i];
        string sysText = sysSent.Get(typeof(CoreAnnotations.TextAnnotation));
        string goldText = goldSent.Get(typeof(CoreAnnotations.TextAnnotation));
        if (verbose) {
            log.Info("SCORING THE FOLLOWING SENTENCE:");
            log.Info(sysSent.Get(typeof(CoreAnnotations.TokensAnnotation)));
        }
        HashSet<string> matchedGolds = new HashSet<string>();
        IList<EntityMention> goldEntities = goldSent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        if (goldEntities == null) {
            goldEntities = new List<EntityMention>();
        }
        foreach (EntityMention m in goldEntities) {
            string label = MakeLabel(m);
            if (excludedClasses != null && excludedClasses.Contains(label)) {
                continue;
            }
            gold.IncrementCount(label);
        }
        IList<EntityMention> sysEntities = sysSent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        if (sysEntities == null) {
            sysEntities = new List<EntityMention>();
        }
        foreach (EntityMention m_1 in sysEntities) {
            string label = MakeLabel(m_1);
            if (excludedClasses != null && excludedClasses.Contains(label)) {
                continue;
            }
            predicted.IncrementCount(label);
            if (verbose) {
                log.Info("COMPARING PREDICTED MENTION: " + m_1);
            }
            bool found = false;
            foreach (EntityMention gm in goldEntities) {
                if (matchedGolds.Contains(gm.GetObjectId())) {
                    continue;
                }
                if (verbose) {
                    log.Info("\tagainst: " + gm);
                }
                if (gm.Equals(m_1, useSubTypes)) {
                    if (verbose) {
                        log.Info("\t\t\tMATCH!");
                    }
                    found = true;
                    matchedGolds.Add(gm.GetObjectId());
                    if (verboseInstances) {
                        log.Info("TRUE POSITIVE: " + m_1 + " matched " + gm);
                        log.Info("In sentence: " + sysText);
                    }
                    break;
                }
            }
            if (found) {
                correct.IncrementCount(label);
            } else {
                if (verboseInstances) {
                    log.Info("FALSE POSITIVE: " + m_1.ToString());
                    log.Info("In sentence: " + sysText);
                }
            }
        }
        if (verboseInstances) {
            foreach (EntityMention m_2 in goldEntities) {
                string label = MakeLabel(m_2);
                if (!matchedGolds.Contains(m_2.GetObjectId()) && (excludedClasses == null || !excludedClasses.Contains(label))) {
                    log.Info("FALSE NEGATIVE: " + m_2.ToString());
                    log.Info("In sentence: " + goldText);
                }
            }
        }
    }
    double totalCount = 0;
    double totalCorrect = 0;
    double totalPredicted = 0;
    pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF");
    IList<string> labels = new List<string>(gold.KeySet());
    labels.Sort();
    foreach (string label_1 in labels) {
        if (excludedClasses != null && excludedClasses.Contains(label_1)) {
            continue;
        }
        double numCorrect = correct.GetCount(label_1);
        double numPredicted = predicted.GetCount(label_1);
        double trueCount = gold.GetCount(label_1);
        double precision = (numPredicted > 0) ? (numCorrect / numPredicted) : 0;
        double recall = numCorrect / trueCount;
        double f = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
        pw.Println(StringUtils.PadOrTrim(label_1, 21) + "\t" + numCorrect + "\t" + numPredicted + "\t" + trueCount + "\t" + Formatter.Format(precision * 100) + "\t" + Formatter.Format(100 * recall) + "\t" + Formatter.Format(100 * f));
        totalCount += trueCount;
        totalCorrect += numCorrect;
        totalPredicted += numPredicted;
    }
    double precision_1 = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0;
    double recall_1 = totalCorrect / totalCount;
    double f_1 = (totalPredicted > 0 && totalCorrect > 0) ? 2 * precision_1 * recall_1 / (precision_1 + recall_1) : 0.0;
    pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + Formatter.Format(100 * precision_1) + "\t" + Formatter.Format(100 * recall_1) + "\t" + Formatter.Format(100 * f_1));
}
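// Hedged worked example of the scoring above (numbers are illustrative): for a label with
// 8 correct, 10 predicted, and 16 gold mentions,
//   precision = 8 / 10 = 0.8,  recall = 8 / 16 = 0.5,
//   F1 = 2 * 0.8 * 0.5 / (0.8 + 0.5) ≈ 0.615,
// and all three are printed as percentages in the table row for that label.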
/// <summary>
/// Return various statistics about the treebank (number of sentences,
/// words, tag set, etc.).
/// </summary>
/// <param name="tlp">
/// The TreebankLanguagePack used to determine punctuation and an
/// appropriate character encoding
/// </param>
/// <returns>A big string for human consumption describing the treebank</returns>
public virtual string TextualSummary(ITreebankLanguagePack tlp) {
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<Tree>();
    ClassicCounter<string> roots = new ClassicCounter<string>();
    ClassicCounter<string> starts = new ClassicCounter<string>();
    ClassicCounter<string> puncts = new ClassicCounter<string>();
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = int.MaxValue;
    int longestSentence = 0;
    int numNullLabel = 0;
    ICollection<string> words = Generics.NewHashSet();
    ClassicCounter<string> tags = new ClassicCounter<string>();
    ClassicCounter<string> cats = new ClassicCounter<string>();
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;
    foreach (Tree t in this) {
        roots.IncrementCount(t.Value());
        numTrees++;
        int leng = t.Yield().Count;
        if (leng <= 40) {
            numTreesLE40++;
        }
        if (leng < shortestSentence) {
            shortestSentence = leng;
        }
        if (leng > longestSentence) {
            longestSentence = leng;
        }
        if (t.NumChildren() > 1) {
            if (numNonUnaryRoots == 0) {
                nonUnaryEg = t;
            }
            if (numNonUnaryRoots < 100) {
                nonUnaries.IncrementCount(t.LocalTree());
            }
            numNonUnaryRoots++;
        } else {
            if (t.IsLeaf()) {
                numUnenclosedLeaves++;
            } else {
                Tree t2 = t.FirstChild();
                if (t2.IsLeaf()) {
                    numLeaves++;
                    leafEg = t;
                } else {
                    if (t2.IsPreTerminal()) {
                        if (numNonPhrasal == 0) {
                            rootRewritesAsTaggedWordEg = t;
                        }
                        numNonPhrasal++;
                    }
                }
                starts.IncrementCount(t2.Value());
            }
        }
        foreach (Tree subtree in t) {
            ILabel lab = subtree.Label();
            if (lab == null || lab.Value() == null || lab.Value().IsEmpty()) {
                if (numNullLabel == 0) {
                    nullLabelEg = subtree;
                }
                numNullLabel++;
                if (lab == null) {
                    subtree.SetLabel(new StringLabel(string.Empty));
                } else {
                    if (lab.Value() == null) {
                        subtree.Label().SetValue(string.Empty);
                    }
                }
            }
            if (subtree.IsLeaf()) {
                numWords++;
                words.Add(subtree.Value());
            } else {
                if (subtree.IsPreTerminal()) {
                    numTags++;
                    tags.IncrementCount(subtree.Value());
                    if (tlp != null && tlp.IsPunctuationTag(subtree.Value())) {
                        puncts.IncrementCount(subtree.FirstChild().Value());
                    }
                } else {
                    if (subtree.IsPhrasal()) {
                        bool hasLeafChild = false;
                        foreach (Tree kt in subtree.Children()) {
                            if (kt.IsLeaf()) {
                                hasLeafChild = true;
                            }
                        }
                        if (hasLeafChild) {
                            numPreTerminalWithMultipleChildren++;
                            if (preTerminalMultipleChildrenEg == null) {
                                preTerminalMultipleChildrenEg = subtree;
                            }
                        }
                        cats.IncrementCount(subtree.Value());
                    } else {
                        throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
                    }
                }
            }
        }
    }
    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(0);
    pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
    if (numTrees > 0) {
        if (numTags != numWords) {
            pw.Println(" Warning! numTags differs and is " + numTags);
        }
        if (roots.Size() == 1) {
            string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
            pw.Println(" The root category is: " + root);
        } else {
            pw.Println(" Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
        }
        if (numNonUnaryRoots > 0) {
            pw.Print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
            if (numNonUnaryRoots > 100) {
                pw.Print("First 100 ");
            }
            pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
            pw.Println(" Example: " + nonUnaryEg);
        }
        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0) {
            pw.Println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0) {
                pw.Println(" Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0) {
                pw.Println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }
        if (numNullLabel > 0) {
            pw.Println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.Println(" " + nullLabelEg);
        }
        if (numPreTerminalWithMultipleChildren > 0) {
            pw.Println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.Println(" Example: " + preTerminalMultipleChildrenEg);
        }
        pw.Println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
        pw.Println(" " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");
        string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed.... Would need to look
        // for them with a suffix of -[0-9]+
        ICollection<string> knownEmpties = Generics.NewHashSet(Arrays.AsList(empties));
        ICollection<string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
        if (!emptiesIntersection.IsEmpty()) {
            pw.Println(" Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
        }
        ICollection<string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
        if (!joint.IsEmpty()) {
            pw.Println(" Warning! " + joint.Count + " items are tags and categories: " + joint);
        }
        foreach (string cat in cats.KeySet()) {
            if (cat != null && cat.Contains("@")) {
                pw.Println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        foreach (string cat_1 in tags.KeySet()) {
            if (cat_1 != null && cat_1.Contains("@")) {
                pw.Println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat_1);
                break;
            }
        }
        pw.Println(" Cats: " + Counters.ToString(cats, nf));
        pw.Println(" Tags: " + Counters.ToString(tags, nf));
        pw.Println(" " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
        if (!puncts.IsEmpty()) {
            pw.Println(" Puncts: " + Counters.ToString(puncts, nf));
        }
    }
    return sw.ToString();
}
/// <param name="args"/>
public static void Main(string[] args) {
    if (args.Length != 4) {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }
    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);
    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);
    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    if (language.Equals(Language.Arabic)) {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        string[] languageOptions = new string[] { "-arabicFactored" };
        tlpp.SetOptionFlag(languageOptions, 0);
    } else {
        if (language.Equals(Language.French)) {
            morphoSpec = new FrenchMorphoFeatureSpecification();
            string[] languageOptions = new string[] { "-frenchFactored" };
            tlpp.SetOptionFlag(languageOptions, 0);
        } else {
            throw new NotSupportedException();
        }
    }
    string featureList = args[1];
    string[] features = featureList.Trim().Split(",");
    foreach (string feature in features) {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);
    // Create word and tag indices
    // Save trees in a collection since the interface requires that....
    System.Console.Out.Write("Loading training trees...");
    IList<Tree> trainTrees = new List<Tree>(19000);
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    foreach (Tree tree in trainTreebank) {
        foreach (Tree subTree in tree) {
            if (!subTree.IsLeaf()) {
                tlpp.TransformTree(subTree, tree);
            }
        }
        trainTrees.Add(tree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
    // Setup and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    trainTrees = null;
    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList<FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter<string> errors = new ClassicCounter<string>();
    foreach (FactoredLexiconEvent @event in tuningSet) {
        IEnumerator<IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
        ICounter<int> logScores = new ClassicCounter<int>();
        bool noRules = true;
        int goldTagId = -1;
        while (itr.MoveNext()) {
            noRules = false;
            IntTaggedWord iTW = itr.Current;
            if (iTW.Tag() == @event.TagId()) {
                log.Info("GOLD-");
                goldTagId = iTW.Tag();
            }
            float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
            logScores.IncrementCount(iTW.Tag(), tagScore);
        }
        if (noRules) {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
        } else {
            // Score the tagging
            int hypTagId = Counters.Argmax(logScores);
            if (hypTagId == goldTagId) {
                ++nCorrect;
            } else {
                string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                errors.IncrementCount(goldTag);
            }
        }
        log.Info();
    }
    // Output accuracy
    double acc = (double)nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.Info("% of errors by type:");
    IList<string> biggestKeys = new List<string>(errors.KeySet());
    biggestKeys.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string key in biggestKeys) {
        System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
    }
}
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage);
        System.Environment.Exit(-1);
    }
    // Process command-line options
    Properties options = StringUtils.ArgsToProperties(args, optionArgDefinitions);
    string fileName = options.GetProperty(string.Empty);
    if (fileName == null || fileName.Equals(string.Empty)) {
        System.Console.Out.WriteLine(usage);
        System.Environment.Exit(-1);
    }
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;
    string encoding = options.GetProperty("e", "UTF-8");
    tlpp.SetInputEncoding(encoding);
    tlpp.SetOutputEncoding(encoding);
    DiskTreebank tb = tlpp.DiskTreebank();
    tb.LoadPath(fileName);
    // Statistics
    ICounter<string> binaryRuleTypes = new ClassicCounter<string>(20000);
    IList<int> branchingFactors = new List<int>(20000);
    int nTrees = 0;
    int nUnaryRules = 0;
    int nBinaryRules = 0;
    int binaryBranchingFactors = 0;
    // Read the treebank
    PrintWriter pw = tlpp.Pw();
    foreach (Tree readTree in tb) {
        // descend past the ROOT node before counting rules
        Tree tree = readTree.Value().Equals("ROOT") ? readTree.FirstChild() : readTree;
        ++nTrees;
        foreach (Tree subTree in tree) {
            if (subTree.IsPhrasal()) {
                if (subTree.NumChildren() > 1) {
                    ++nBinaryRules;
                    branchingFactors.Add(subTree.NumChildren());
                    binaryBranchingFactors += subTree.NumChildren();
                    binaryRuleTypes.IncrementCount(TreeToRuleString(subTree));
                } else {
                    ++nUnaryRules;
                }
            }
        }
    }
    double mean = (double)binaryBranchingFactors / (double)nBinaryRules;
    System.Console.Out.Printf("#trees:\t%d%n", nTrees);
    System.Console.Out.Printf("#binary:\t%d%n", nBinaryRules);
    System.Console.Out.Printf("#binary types:\t%d%n", binaryRuleTypes.KeySet().Count);
    System.Console.Out.Printf("mean branching:\t%.4f%n", mean);
    System.Console.Out.Printf("stddev branching:\t%.4f%n", StandardDeviation(branchingFactors, mean));
    System.Console.Out.Printf("rule entropy:\t%.5f%n", Counters.Entropy(binaryRuleTypes));
    System.Console.Out.Printf("#unaries:\t%d%n", nUnaryRules);
}
public static void Main(string[] args) {
    if (args.Length < minArgs) {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    string puncTag = null;
    for (int i = 0; i < args.Length; i++) {
        if (args[i].StartsWith("-")) {
            switch (args[i]) {
                case "-l": {
                    Language lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }
                case "-e": {
                    encoding = args[++i];
                    break;
                }
                default: {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        } else {
            puncTag = args[i++];
            if (tb == null) {
                if (tlpp == null) {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                } else {
                    tlpp.SetInputEncoding(encoding);
                    tlpp.SetOutputEncoding(encoding);
                    tb = tlpp.DiskTreebank();
                }
            }
            tb.LoadPath(args[i]);
        }
    }
    ICounter<string> puncTypes = new ClassicCounter<string>();
    foreach (Tree t in tb) {
        IList<CoreLabel> yield = t.TaggedLabeledYield();
        foreach (CoreLabel word in yield) {
            if (word.Tag().Equals(puncTag)) {
                puncTypes.IncrementCount(word.Word());
            }
        }
    }
    IList<string> biggestKeys = new List<string>(puncTypes.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(puncTypes));
    PrintWriter pw = tlpp.Pw();
    foreach (string wordType in biggestKeys) {
        pw.Printf("%s\t%d%n", wordType, (int)puncTypes.GetCount(wordType));
    }
    pw.Close();
}