/// <summary>Method to convert features from counts to L1-normalized TFIDF based features</summary>
        /// <param name="datum">with a collection of features.</param>
        /// <param name="featureDocCounts">a counter of doc-count for each feature.</param>
        /// <returns>RVFDatum with l1-normalized tf-idf features.</returns>
        public virtual RVFDatum <L, F> GetL1NormalizedTFIDFDatum(IDatum <L, F> datum, ICounter <F> featureDocCounts)
        {
            // Raw term frequencies, restricted to features with a known document count.
            ICounter <F> tfidfFeatures = new ClassicCounter <F>();
            foreach (F feat in datum.AsFeatures())
            {
                if (featureDocCounts.ContainsKey(feat))
                {
                    tfidfFeatures.IncrementCount(feat, 1.0);
                }
            }
            // Weight each term frequency by its smoothed inverse document frequency,
            // accumulating the L1 norm of the weighted vector as we go.
            double l1norm = 0;
            foreach (F feat in tfidfFeatures.KeySet())
            {
                double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feat) + 0.5));
                double weighted = tfidfFeatures.GetCount(feat) * idf;
                tfidfFeatures.SetCount(feat, weighted);
                l1norm += weighted;
            }
            // Normalize so the tf-idf scores sum to one.
            foreach (F feat in tfidfFeatures.KeySet())
            {
                tfidfFeatures.SetCount(feat, tfidfFeatures.GetCount(feat) / l1norm);
            }
            return new RVFDatum <L, F>(tfidfFeatures, datum.Label());
        }
 /// <summary>
 /// Computes Good-Turing-style statistics for unknown words: for each tag, counts
 /// once-seen word/tag pairs (r1) and seen words never observed with the tag (r0),
 /// then stores a log probability for unknown words under each tag.
 /// </summary>
 public virtual void FinishTraining()
 {
     // testing: get some stats here
     log.Info("Total tokens: " + tokens);
     log.Info("Total WordTag types: " + wtCount.KeySet().Count);
     log.Info("Total tag types: " + tagCount.KeySet().Count);
     log.Info("Total word types: " + seenWords.Count);
     // Count, per tag, the word/tag pairs observed exactly once.
     foreach (Pair <string, string> wordTag in wtCount.KeySet())
     {
         if (wtCount.GetCount(wordTag) == 1)
         {
             r1.IncrementCount(wordTag.Second());
         }
     }
     // Count, per tag, the seen words never observed with that tag.
     foreach (string tag in tagCount.KeySet())
     {
         foreach (string word in seenWords)
         {
             if (!(wtCount.KeySet().Contains(new Pair <string, string>(word, tag))))
             {
                 r0.IncrementCount(tag);
             }
         }
     }
     // Derive the unseen-word log probability for each tag from r1 and r0.
     foreach (string tag in tagCount.KeySet())
     {
         float logprob = (float)Math.Log(r1.GetCount(tag) / (tagCount.GetCount(tag) * r0.GetCount(tag)));
         unknownGT[tag] = float.ValueOf(logprob);
     }
 }
// 示例#3 (snippet separator from source aggregator)
// 0
        /// <summary>
        /// Builds the rulesWithWord array mapping each word index to the list of
        /// IntTaggedWord rules observed with that word, and seeds the unknown-word
        /// entry with the open-class tags.
        /// </summary>
        protected internal virtual void InitRulesWithWord()
        {
            if (testOptions.verbose || DebugLexicon)
            {
                log.Info("Initializing lexicon scores ... ");
            }
            // int numWords = words.size()+sigs.size()+1;
            int unkWord  = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
            int numWords = wordIndex.Size();

            rulesWithWord = new IList[numWords];
            for (int w = 0; w < numWords; w++)
            {
                rulesWithWord[w] = new List <IntTaggedWord>(1);
            }
            // most have 1 or 2
            // items in them
            // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
            // Collect the tag-only entries (nullWord paired with a real tag) from the
            // seen counter; these are the candidate tags for unknown words.
            tags = Generics.NewHashSet();
            foreach (IntTaggedWord iTW in seenCounter.KeySet())
            {
                if (iTW.Word() == nullWord && iTW.Tag() != nullTag)
                {
                    tags.Add(iTW);
                }
            }
            // tags for unknown words
            // Keep only tags whose unseen-word type count exceeds the open-class
            // threshold, and attach each as a rule under the unknown-word index.
            foreach (IntTaggedWord iT in tags)
            {
                double types = uwModel.UnSeenCounter().GetCount(iT);
                if (types > trainOptions.openClassTypesThreshold)
                {
                    // Number of types before it's treated as open class
                    IntTaggedWord iTW_1 = new IntTaggedWord(unkWord, iT.tag);
                    rulesWithWord[iTW_1.word].Add(iTW_1);
                }
            }
            if (testOptions.verbose || DebugLexicon)
            {
                StringBuilder sb = new StringBuilder();
                sb.Append("The ").Append(rulesWithWord[unkWord].Count).Append(" open class tags are: [");
                foreach (IntTaggedWord item in rulesWithWord[unkWord])
                {
                    sb.Append(' ').Append(tagIndex.Get(item.Tag()));
                }
                sb.Append(" ]");
                log.Info(sb.ToString());
            }
            // Register every fully-specified seen word/tag pair under its word index.
            foreach (IntTaggedWord iTW_2 in seenCounter.KeySet())
            {
                if (iTW_2.Tag() != nullTag && iTW_2.Word() != nullWord)
                {
                    rulesWithWord[iTW_2.word].Add(iTW_2);
                }
            }
        }
        /// <summary>Need to sort the counter by feature keys and dump it</summary>
        /// <param name="pw">destination writer (one svm_light line is printed).</param>
        /// <param name="c">feature-index to value counter for one datum.</param>
        /// <param name="classNo">class label written at the start of the line.</param>
        public static void PrintSVMLightFormat(PrintWriter pw, ClassicCounter <int> c, int classNo)
        {
            // svm_light features are 1-based and must appear in ascending index order.
            int[] sortedFeatures = Sharpen.Collections.ToArray(c.KeySet(), new int[c.KeySet().Count]);
            Arrays.Sort(sortedFeatures);
            StringBuilder line = new StringBuilder();
            line.Append(classNo);
            line.Append(' ');
            foreach (int feature in sortedFeatures)
            {
                line.Append(feature + 1).Append(':').Append(c.GetCount(feature)).Append(' ');
            }
            pw.Println(line.ToString());
        }
 /// <summary>
 /// Converts per-tag word counts into per-tag log probabilities and returns the
 /// trained unknown-word model.
 /// </summary>
 public override IUnknownWordModel FinishTraining()
 {
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // Outer iteration is over tags; c maps each tag Label to its word counts.
     foreach (KeyValuePair <ILabel, ClassicCounter <string> > entry in c)
     {
         ILabel tag = entry.Key;
         ClassicCounter <string> wordCounts = entry.Value;
         if (!tagHash.Contains(tag))
         {
             tagHash[tag] = new ClassicCounter <string>();
         }
         /* the UNKNOWN sequence is assumed to be seen once in each tag */
         // This is sort of broken, but you can regard it as a Dirichlet prior.
         tc.IncrementCount(tag);
         wordCounts.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         // Inner iteration is over words: store log p(word|tag) in tagHash.
         foreach (string word in wordCounts.KeySet())
         {
             tagHash[tag].SetCount(word, Math.Log(wordCounts.GetCount(word) / tc.GetCount(tag)));
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding);
     return model;
 }
        /// <summary>
        /// Custom serialization hook: temporarily replaces argCounter and stopCounter
        /// with compressed versions (dropping wildcard entries and entries with a -1
        /// word) while the default serialization runs, then restores the full counters.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        private void WriteObject(ObjectOutputStream stream)
        {
            ClassicCounter <IntDependency> fullArgCounter  = argCounter;
            ClassicCounter <IntDependency> fullStopCounter = stopCounter;
            try
            {
                // Keep only fully-specified argument dependencies (no wildcards, no -1 words).
                argCounter = new ClassicCounter <IntDependency>();
                foreach (IntDependency dependency in fullArgCounter.KeySet())
                {
                    if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1)
                    {
                        argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency));
                    }
                }
                // Keep only stop dependencies with a real head word.
                stopCounter = new ClassicCounter <IntDependency>();
                foreach (IntDependency dependency in fullStopCounter.KeySet())
                {
                    if (dependency.head.word != -1)
                    {
                        stopCounter.IncrementCount(dependency, fullStopCounter.GetCount(dependency));
                    }
                }
                stream.DefaultWriteObject();
            }
            finally
            {
                // Restore the uncompressed counters even if serialization throws, so the
                // in-memory object is never left pointing at the compressed counters.
                argCounter  = fullArgCounter;
                stopCounter = fullStopCounter;
            }
        }
        /// <summary>
        /// Custom deserialization hook: the serialized counters are in compressed form,
        /// so rebuild the full counters by re-expanding every compressed entry.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private void ReadObject(ObjectInputStream stream)
        {
            stream.DefaultReadObject();
            ClassicCounter <IntDependency> compressedArgs = argCounter;
            argCounter = new ClassicCounter <IntDependency>();
            ClassicCounter <IntDependency> compressedStops = stopCounter;
            stopCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency arg in compressedArgs.KeySet())
            {
                ExpandArg(arg, arg.distance, compressedArgs.GetCount(arg));
            }
            foreach (IntDependency stop in compressedStops.KeySet())
            {
                ExpandStop(stop, stop.distance, compressedStops.GetCount(stop), false);
            }
            // The expansion cache is only needed during deserialization.
            expandDependencyMap = null;
        }
 /// <summary>
 /// Converts per-tag first-character counts into per-tag log probabilities and
 /// returns the trained unknown-word model.
 /// </summary>
 public override IUnknownWordModel FinishTraining()
 {
     // Map<String,Float> unknownGT = null;
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // unknownGT = unknownGTTrainer.unknownGT;
     // Outer iteration is over tags as Labels.
     foreach (ILabel tagLabel in c.Keys)
     {
         // counts for first characters given this tag
         ClassicCounter <string> charCounts = c[tagLabel];
         if (!tagHash.Contains(tagLabel))
         {
             tagHash[tagLabel] = new ClassicCounter <string>();
         }
         // the UNKNOWN first character is assumed to be seen once in
         // each tag
         // this is really sort of broken!  (why??)
         tc.IncrementCount(tagLabel);
         charCounts.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         // Inner iteration is over first characters: store log p(char|tag).
         foreach (string firstChar in charCounts.KeySet())
         {
             tagHash[tagLabel].SetCount(firstChar, Math.Log(charCounts.GetCount(firstChar) / tc.GetCount(tagLabel)));
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding);
     return model;
 }
// 示例#9 (snippet separator from source aggregator)
// 0
        /// <summary>
        /// Post-processes predicted mentions: collects span strings of NE-headed
        /// mentions, then for each head position keeps only the mention the classifier
        /// scores highest and removes the rest from the sentence's mention list.
        /// </summary>
        public virtual void ClassifyMentions(IList <IList <Mention> > predictedMentions, Dictionaries dict, Properties props)
        {
            // Lowercased span strings of mentions whose head token has a NER tag.
            ICollection <string> neStrings = Generics.NewHashSet();

            foreach (IList <Mention> predictedMention in predictedMentions)
            {
                foreach (Mention m in predictedMention)
                {
                    string ne = m.headWord.Ner();
                    if (ne.Equals("O"))
                    {
                        continue;
                    }
                    // NOTE(review): this loop is a no-op — 'continue' targets the inner
                    // loop, so the mention is added below regardless of the token NER
                    // labels. It looks like it was meant to skip mentions with mixed NER;
                    // confirm against the upstream implementation before changing.
                    foreach (CoreLabel cl in m.originalSpan)
                    {
                        if (!cl.Ner().Equals(ne))
                        {
                            continue;
                        }
                    }
                    neStrings.Add(m.LowercaseNormalizedSpanString());
                }
            }
            foreach (IList <Mention> predicts in predictedMentions)
            {
                // Group this sentence's mentions by head index.
                IDictionary <int, ICollection <Mention> > headPositions = Generics.NewHashMap();
                foreach (Mention p in predicts)
                {
                    if (!headPositions.Contains(p.headIndex))
                    {
                        headPositions[p.headIndex] = Generics.NewHashSet();
                    }
                    headPositions[p.headIndex].Add(p);
                }
                ICollection <Mention> remove = Generics.NewHashSet();
                foreach (int hPos in headPositions.Keys)
                {
                    ICollection <Mention> shares = headPositions[hPos];
                    if (shares.Count > 1)
                    {
                        // Score every mention sharing this head; keep only the argmax.
                        ICounter <Mention> probs = new ClassicCounter <Mention>();
                        foreach (Mention p_1 in shares)
                        {
                            double trueProb = ProbabilityOf(p_1, shares, neStrings, dict, props);
                            probs.IncrementCount(p_1, trueProb);
                        }
                        // add to remove
                        Mention keep = Counters.Argmax(probs, null);
                        probs.Remove(keep);
                        Sharpen.Collections.AddAll(remove, probs.KeySet());
                    }
                }
                foreach (Mention r in remove)
                {
                    predicts.Remove(r);
                }
            }
        }
        /// <summary>Returns a list of all modes in the Collection.</summary>
        /// <remarks>
        /// Returns a list of all modes in the Collection.  (If the Collection has multiple items with the
        /// highest frequency, all of them will be returned.)
        /// </remarks>
        public static ICollection <T> Modes <T>(ICollection <T> values)
        {
            // Count occurrences, find the largest count, then keep only the items
            // whose count reaches that maximum.
            ICounter <T> counter = new ClassicCounter <T>(values);
            IList <double> ascendingCounts = Edu.Stanford.Nlp.Util.CollectionUtils.Sorted(counter.Values());
            double maxCount = ascendingCounts[ascendingCounts.Count - 1];
            Counters.RetainAbove(counter, maxCount);
            return counter.KeySet();
        }
// 示例#11 (snippet separator from source aggregator)
// 0
            /// <summary>Prints every key with its count, most frequent first.</summary>
            private static void Display <T>(ClassicCounter <T> c, PrintWriter pw)
            {
                IList <T> sortedKeys = new List <T>(c.KeySet());
                sortedKeys.Sort(Counters.ToComparatorDescending(c));
                foreach (T key in sortedKeys)
                {
                    pw.Println(key + " " + c.GetCount(key));
                }
            }
        /// <summary>
        /// Runs pairwise coreference over the document: scores candidate mention pairs
        /// with the classifier, then walks the pairs in sorted order, linking each
        /// anaphor at most once when its score clears the type-dependent threshold.
        /// </summary>
        public virtual void RunCoref(Document document)
        {
            Compressor <string> compressor = new Compressor <string>();

            if (Thread.Interrupted())
            {
                // Allow interrupting
                throw new RuntimeInterruptedException();
            }
            // Candidate (antecedent, anaphor) index pairs surviving the distance heuristics.
            IDictionary <Pair <int, int>, bool> pairs = new Dictionary <Pair <int, int>, bool>();

            foreach (KeyValuePair <int, IList <int> > e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch))
            {
                foreach (int m1 in e.Value)
                {
                    pairs[new Pair <int, int>(m1, e.Key)] = true;
                }
            }
            // Extract features and score every candidate pair with the classifier.
            DocumentExamples            examples       = extractor.Extract(0, document, pairs, compressor);
            ICounter <Pair <int, int> > pairwiseScores = new ClassicCounter <Pair <int, int> >();

            foreach (Example mentionPair in examples.examples)
            {
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                pairwiseScores.IncrementCount(new Pair <int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor));
            }
            IList <Pair <int, int> > mentionPairs = new List <Pair <int, int> >(pairwiseScores.KeySet());

            // Sort(null) uses the pairs' default comparer.
            mentionPairs.Sort(null);
            // Each anaphor (pair.second) is considered at most once; the first pair
            // encountered in sorted order wins.
            ICollection <int> seenAnaphors = new HashSet <int>();

            foreach (Pair <int, int> pair in mentionPairs)
            {
                if (seenAnaphors.Contains(pair.second))
                {
                    continue;
                }
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                seenAnaphors.Add(pair.second);
                Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType;
                Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType;
                // The merge threshold depends on whether each side is pronominal.
                if (pairwiseScores.GetCount(pair) > thresholds[new Pair <bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)])
                {
                    CorefUtils.MergeCoreferenceClusters(pair, document);
                }
            }
        }
        /// <summary>Builds the MLE dependency grammar from the accumulated dependency counts.</summary>
        public override IDependencyGrammar FormResult()
        {
            // Make sure the unknown-word token is in the index before building the grammar.
            wordIndex.AddToIndex(LexiconConstants.UnknownWord);
            MLEDependencyGrammar grammar = new MLEDependencyGrammar(tlpParams, directional, useDistance, useCoarseDistance, basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);
            // Transfer every counted dependency into the grammar as a weighted rule.
            foreach (IntDependency dep in dependencyCounter.KeySet())
            {
                grammar.AddRule(dep, dependencyCounter.GetCount(dep));
            }
            return grammar;
        }
        /// <summary>Prints nonterminal counts to stdout, most frequent first.</summary>
        public virtual void DumpStats()
        {
            System.Console.Out.WriteLine("%% Counts of nonterminals:");
            IList <string> sortedNonTerms = new List <string>(nonTerms.KeySet());
            sortedNonTerms.Sort(Counters.ToComparatorDescending(nonTerms));
            foreach (string nonTerm in sortedNonTerms)
            {
                System.Console.Out.WriteLine(nonTerm + ": " + nonTerms.GetCount(nonTerm));
            }
        }
        /// <summary>Builds a TransducerGraph containing one weighted path per counted path.</summary>
        public static TransducerGraph CreateGraphFromPaths <T>(ClassicCounter <IList <T> > pathCounter, int markovOrder)
        {
            // Start from an empty graph and add each path with its observed count.
            TransducerGraph graph = new TransducerGraph();
            foreach (IList <T> path in pathCounter.KeySet())
            {
                AddOnePathToGraph(path, pathCounter.GetCount(path), markovOrder, graph);
            }
            return graph;
        }
// 示例#16 (snippet separator from source aggregator)
// 0
        /// <summary>
        /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
        /// using the actual features and labels.
        /// </summary>
        /// <remarks>
        /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
        /// using the actual features and labels.  Because this is svm_light, and not svm_struct, the
        /// weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class
        /// (which correspond to labelIndex.get(1)) are just the negation of one another.
        /// </remarks>
        private ClassicCounter <Pair <F, L> > ConvertSVMLightWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex)
        {
            ClassicCounter <Pair <F, L> > converted = new ClassicCounter <Pair <F, L> >();
            foreach (int index in weights.KeySet())
            {
                // svm_light feature indices are 1-based, so shift back into featureIndex.
                F feature = featureIndex.Get(index - 1);
                double weight = weights.GetCount(index);
                // labelIndex.Get(0) is the +1 class; labelIndex.Get(1) is the -1 class
                // and receives the negated weight.
                converted.IncrementCount(new Pair <F, L>(feature, labelIndex.Get(0)), weight);
                converted.IncrementCount(new Pair <F, L>(feature, labelIndex.Get(1)), -weight);
            }
            return converted;
        }
// 示例#17 (snippet separator from source aggregator)
// 0
        /// <summary>
        /// Converts the svm_struct weight Counter (in which the weight for a feature/label pair
        /// correspondes to ((labelIndex * numFeatures)+(featureIndex+1))) into a weight Counter
        /// using the actual features and labels.
        /// </summary>
        private ClassicCounter <Pair <F, L> > ConvertSVMStructWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex)
        {
            int numFeatures = featureIndex.Size();
            ClassicCounter <Pair <F, L> > converted = new ClassicCounter <Pair <F, L> >();
            foreach (int index in weights.KeySet())
            {
                // Decode the flattened coordinate: integer division recovers the label
                // block, the remainder recovers the (1-based) feature slot.
                L label = labelIndex.Get((index - 1) / numFeatures);
                F feature = featureIndex.Get((index - 1) % numFeatures);
                converted.IncrementCount(new Pair <F, L>(feature, label), weights.GetCount(index));
            }
            return converted;
        }
// 示例#18 (snippet separator from source aggregator)
// 0
            /// <summary>Prints up to num keys with their counts, most frequent first.</summary>
            private static void Display <T>(ClassicCounter <T> c, int num, PrintWriter pw)
            {
                IList <T> sortedKeys = new List <T>(c.KeySet());
                sortedKeys.Sort(Counters.ToComparatorDescending(c));
                // Clamp the requested number of rows to the number of keys available.
                int available = sortedKeys.Count;
                if (num > available)
                {
                    num = available;
                }
                for (int i = 0; i < num; i++)
                {
                    pw.Println(sortedKeys[i] + " " + c.GetCount(sortedKeys[i]));
                }
            }
// 示例#19 (snippet separator from source aggregator)
// 0
        // todo: Fix javadoc, have unit tested
        /// <summary>Print SVM Light Format file.</summary>
        /// <remarks>
        /// Print SVM Light Format file.
        /// The following comments are no longer applicable because I am
        /// now printing out the exact labelID for each example. -Ramesh ([email protected]) 12/17/2009.
        /// If the Dataset has more than 2 classes, then it
        /// prints using the label index (+1) (for svm_struct).  If it is 2 classes, then the labelIndex.get(0)
        /// is mapped to +1 and labelIndex.get(1) is mapped to -1 (for svm_light).
        /// </remarks>
        public virtual void PrintSVMLightFormat(PrintWriter pw)
        {
            // svm_light has label conventions our label indices don't follow by default
            // (e.g. multiclass labels start at 1, not 0), so every label goes through
            // the svm label map rather than being printed directly.
            string[] labelMap = MakeSvmLabelMap();
            for (int i = 0; i < size; i++)
            {
                // Re-key this datum's feature counts by integer feature index.
                RVFDatum <L, F> datum = GetRVFDatum(i);
                ICounter <F> featureCounts = datum.AsFeaturesCounter();
                ClassicCounter <int> indexedCounts = new ClassicCounter <int>();
                foreach (F feature in featureCounts.KeySet())
                {
                    indexedCounts.SetCount(featureIndex.IndexOf(feature), featureCounts.GetCount(feature));
                }
                // svm_light requires features sorted by ascending (1-based) index.
                int[] sortedFeatures = Sharpen.Collections.ToArray(indexedCounts.KeySet(), new int[indexedCounts.KeySet().Count]);
                Arrays.Sort(sortedFeatures);
                StringBuilder line = new StringBuilder();
                line.Append(labelMap[labels[i]]).Append(' ');
                foreach (int feature in sortedFeatures)
                {
                    line.Append(feature + 1).Append(':').Append(indexedCounts.GetCount(feature)).Append(' ');
                }
                pw.Println(line.ToString());
            }
        }
        /// <summary>
        /// Aggregates pairwise scores into a feature counter: overall max/min plus
        /// average raw and log scores, both overall and per mention-type conjunction.
        /// </summary>
        private static ICounter <string> GetFeatures(ClustererDataLoader.ClustererDoc doc, IList <Pair <int, int> > mentionPairs, ICounter <Pair <int, int> > scores)
        {
            ICounter <string> features  = new ClassicCounter <string>();
            ICounter <string> totals    = new ClassicCounter <string>();
            ICounter <string> totalsLog = new ClassicCounter <string>();
            ICounter <string> counts    = new ClassicCounter <string>();
            double maxScore = 0;
            double minScore = 1;
            foreach (Pair <int, int> pair in mentionPairs)
            {
                // Scores may be stored under the reversed pair.
                Pair <int, int> scored = pair;
                if (!scores.ContainsKey(scored))
                {
                    scored = new Pair <int, int>(scored.second, scored.first);
                }
                double score    = scores.GetCount(scored);
                double logScore = CappedLog(score);
                // Collapse mention types to PRONOMINAL vs NON_PRONOMINAL for the conjunction key.
                string type1 = doc.mentionTypes[scored.first].Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
                string type2 = doc.mentionTypes[scored.second].Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
                string conj  = "_" + type1 + "_" + type2;
                maxScore = Math.Max(maxScore, score);
                minScore = Math.Min(minScore, score);
                totals.IncrementCount(string.Empty, score);
                totalsLog.IncrementCount(string.Empty, logScore);
                counts.IncrementCount(string.Empty);
                totals.IncrementCount(conj, score);
                totalsLog.IncrementCount(conj, logScore);
                counts.IncrementCount(conj);
            }
            features.IncrementCount("max", maxScore);
            features.IncrementCount("min", minScore);
            // Averages are over all pairs ("") and per observed conjunction key.
            foreach (string key in counts.KeySet())
            {
                features.IncrementCount("avg" + key, totals.GetCount(key) / mentionPairs.Count);
                features.IncrementCount("avgLog" + key, totalsLog.GetCount(key) / mentionPairs.Count);
            }
            return features;
        }
 /// <summary>Writes out data from this Object to the Writer w.</summary>
 /// <exception cref="System.IO.IOException"/>
 public override void WriteData(PrintWriter @out)
 {
     // One rule per line: first the fully-specified argument dependencies
     // (no wildcards, no -1 words).
     foreach (IntDependency argDep in argCounter.KeySet())
     {
         if (argDep.head != wildTW && argDep.arg != wildTW && argDep.head.word != -1 && argDep.arg.word != -1)
         {
             double count = argCounter.GetCount(argDep);
             @out.Println(argDep.ToString(wordIndex, tagIndex) + " " + count);
         }
     }
     @out.Println("BEGIN_STOP");
     // Then the stop dependencies with a real head word.
     foreach (IntDependency stopDep in stopCounter.KeySet())
     {
         if (stopDep.head.word != -1)
         {
             double count = stopCounter.GetCount(stopDep);
             @out.Println(stopDep.ToString(wordIndex, tagIndex) + " " + count);
         }
     }
     @out.Flush();
 }
// 示例#22 (snippet separator from source aggregator)
// 0
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
                    if (wordLemmaMap.Contains(word))
                    {
                        wordLemmaMap[word].Add(lemma);
                    }
                    else
                    {
                        ICollection <string> lemmas = Generics.NewHashSet(1);
                        wordLemmaMap[word] = lemmas;
                    }
                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Barf...
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                string lemma = lemmas.GetEnumerator().Current;
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }
示例#23
0
        /// <summary>
        /// Command-line entry point. Loads one or more treebank paths, builds a
        /// lexicon for the selected language, then — after the first 50% of the
        /// trees have been seen — counts the unknown-word-model signatures of
        /// rare words (corpus count &lt; 2) and prints the signatures in
        /// descending frequency order.
        /// </summary>
        /// <param name="args">Flags (-l language, -e encoding) followed by treebank path(s).</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank tb       = null;
            string       encoding = "UTF-8";
            Language     lang     = Language.English;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        // Switch to language-specific parser parameters.
                        lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    // First non-flag argument triggers lazy treebank construction,
                    // so that -l/-e flags seen earlier take effect.
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    tb.LoadPath(args[i]);
                }
            }
            PrintWriter pw = tlpp.Pw();
            Options     op = new Options();

            Options.LexOptions lexOptions = op.lexOptions;
            // Language-specific unknown-word model settings.
            if (lang == Language.French)
            {
                lexOptions.useUnknownWordSignatures = 1;
                lexOptions.smartMutation            = false;
                lexOptions.unknownSuffixSize        = 2;
                lexOptions.unknownPrefixSize        = 1;
            }
            else
            {
                if (lang == Language.Arabic)
                {
                    lexOptions.smartMutation            = false;
                    lexOptions.useUnknownWordSignatures = 9;
                    lexOptions.unknownPrefixSize        = 1;
                    lexOptions.unknownSuffixSize        = 1;
                }
            }
            IIndex <string>   wordIndex    = new HashIndex <string>();
            IIndex <string>   tagIndex     = new HashIndex <string>();
            ILexicon          lex          = tlpp.Lex(op, wordIndex, tagIndex);
            // Only start collecting signatures once half the treebank has been seen,
            // so vocabulary counts are meaningful when testing for rare words.
            int               computeAfter = (int)(0.50 * tb.Count);
            ICounter <string> vocab        = new ClassicCounter <string>();
            ICounter <string> unkCounter   = new ClassicCounter <string>();
            int               treeId       = 0;

            foreach (Tree t in tb)
            {
                IList <ILabel> yield = t.Yield();
                int            posId = 0;
                foreach (ILabel word in yield)
                {
                    vocab.IncrementCount(word.Value());
                    // Rare word in the second half of the corpus: record its signature.
                    if (treeId > computeAfter && vocab.GetCount(word.Value()) < 2.0)
                    {
                        unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(word.Value(), posId++));
                    }
                }
                treeId++;
            }
            IList <string> biggestKeys = new List <string>(unkCounter.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
            foreach (string wordType in biggestKeys)
            {
                pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
            }
            // Fix: the original called pw.Close() twice; a single Close() suffices.
            pw.Close();
        }
示例#24
0
        /// <summary>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses.
        /// </summary>
        /// <remarks>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses. In other words, the evaluation file has the
        /// same format as the training data.
        /// </remarks>
        /// <param name="pwOut">Writer that receives the evaluation report (overall and per-label accuracy).</param>
        private void Evaluate(PrintWriter pwOut)
        {
            log.Info("Starting evaluation...");
            // The test file has segmentation markers and morphological tags,
            // i.e. the same format as the training data (see remarks above).
            bool hasSegmentationMarkers = true;
            bool hasTags = true;
            IDocumentReaderAndWriter <CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
            ObjectBank <IList <CoreLabel> >      lines     = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
            // Optional TEDEval output writers; all four remain null unless tedEvalPrefix is set.
            PrintWriter tedEvalGoldTree  = null;
            PrintWriter tedEvalParseTree = null;
            PrintWriter tedEvalGoldSeg   = null;
            PrintWriter tedEvalParseSeg  = null;

            if (tedEvalPrefix != null)
            {
                try
                {
                    tedEvalGoldTree  = new PrintWriter(tedEvalPrefix + "_gold.ftree");
                    tedEvalGoldSeg   = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
                    tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
                    tedEvalParseSeg  = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
                }
                catch (FileNotFoundException e)
                {
                    // Best effort: log and continue the evaluation without TEDEval output.
                    // tedEvalParseSeg is created last, so the "tedEvalParseSeg != null"
                    // guards below imply all four writers were opened successfully.
                    System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
                }
            }
            ICounter <string> labelTotal   = new ClassicCounter <string>();
            ICounter <string> labelCorrect = new ClassicCounter <string>();
            int total   = 0;
            int correct = 0;

            foreach (IList <CoreLabel> line in lines)
            {
                // Token strings before and after classification. Pre-existing ':' in the
                // raw input is escaped to "#pm#" so ':' can serve as the segment delimiter.
                string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
                string[] goldTokens  = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
                line = classifier.Classify(line);
                string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
                foreach (CoreLabel label in line)
                {
                    // Do not evaluate labeling of whitespace
                    string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
                    if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
                    {
                        total++;
                        string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                        string reference  = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                        labelTotal.IncrementCount(reference);
                        if (hypothesis.Equals(reference))
                        {
                            correct++;
                            labelCorrect.IncrementCount(reference);
                        }
                    }
                }
                if (tedEvalParseSeg != null)
                {
                    tedEvalGoldTree.Printf("(root");
                    tedEvalParseTree.Printf("(root");
                    // Guard against token-count mismatches: only emit the common prefix.
                    int safeLength = inputTokens.Length;
                    if (inputTokens.Length != goldTokens.Length)
                    {
                        log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                        log.Info("    (ignoring any extras)");
                        log.Info("  input: " + Arrays.ToString(inputTokens));
                        log.Info("  gold: " + Arrays.ToString(goldTokens));
                        safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
                    }
                    if (inputTokens.Length != parseTokens.Length)
                    {
                        log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                        log.Info("    (ignoring any extras)");
                        log.Info("  input: " + Arrays.ToString(inputTokens));
                        log.Info("  parse: " + Arrays.ToString(parseTokens));
                        safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
                    }
                    for (int i = 0; i < safeLength; i++)
                    {
                        // Each token may contain multiple ':'-separated segments.
                        foreach (string segment in goldTokens[i].Split(":"))
                        {
                            tedEvalGoldTree.Printf(" (seg %s)", segment);
                        }
                        tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                        foreach (string segment_1 in parseTokens[i].Split(":"))
                        {
                            tedEvalParseTree.Printf(" (seg %s)", segment_1);
                        }
                        tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
                    }
                    tedEvalGoldTree.Printf(")%n");
                    tedEvalGoldSeg.Println();
                    tedEvalParseTree.Printf(")%n");
                    tedEvalParseSeg.Println();
                }
            }
            // Overall accuracy as a percentage of the evaluated (non-whitespace) datums.
            // NOTE(review): if the test file yields zero evaluable datums, total == 0
            // and this produces NaN — confirm inputs are always non-empty.
            double accuracy = ((double)correct) / ((double)total);

            accuracy *= 100.0;
            pwOut.Println("EVALUATION RESULTS");
            pwOut.Printf("#datums:\t%d%n", total);
            pwOut.Printf("#correct:\t%d%n", correct);
            pwOut.Printf("accuracy:\t%.2f%n", accuracy);
            pwOut.Println("==================");
            // Output the per label accuracies
            pwOut.Println("PER LABEL ACCURACIES");
            foreach (string refLabel in labelTotal.KeySet())
            {
                double nTotal   = labelTotal.GetCount(refLabel);
                double nCorrect = labelCorrect.GetCount(refLabel);
                double acc      = (nCorrect / nTotal) * 100.0;
                pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
            }
            if (tedEvalParseSeg != null)
            {
                tedEvalGoldTree.Close();
                tedEvalGoldSeg.Close();
                tedEvalParseTree.Close();
                tedEvalParseSeg.Close();
            }
        }
示例#25
0
        /// <summary>
        /// Command-line entry point. Loads a treebank, matches every subtree whose
        /// root label matches the Tregex pattern built from a command-line category
        /// name ("@" + name), and counts each matched subtree's right-hand side
        /// (the space-joined sequence of child labels). RHS strings are printed
        /// with their counts in descending frequency order.
        /// </summary>
        /// <param name="args">Flags (-l language, -e encoding), then a category name followed by a treebank path.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank  tb        = null;
            string        encoding  = "UTF-8";
            TregexPattern rootMatch = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        // Switch to language-specific parser parameters.
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    // Non-flag arguments are consumed in a pair: a category name
                    // (wrapped into the Tregex pattern "@<name>") and a treebank path.
                    rootMatch = TregexPattern.Compile("@" + args[i++]);
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    // NOTE(review): this i++ plus the loop's own i++ advances i by two
                    // past the path, skipping the following argument — confirm the
                    // intended usage is exactly "<category> <path>" as the trailing args.
                    tb.LoadPath(args[i++]);
                }
            }
            ICounter <string> rhsCounter = new ClassicCounter <string>();

            foreach (Tree t in tb)
            {
                TregexMatcher m = rootMatch.Matcher(t);
                while (m.FindNextMatchingNode())
                {
                    // Record the matched node's RHS: its children's labels, space-joined.
                    Tree          match = m.GetMatch();
                    StringBuilder sb    = new StringBuilder();
                    foreach (Tree kid in match.Children())
                    {
                        sb.Append(kid.Value()).Append(" ");
                    }
                    rhsCounter.IncrementCount(sb.ToString().Trim());
                }
            }
            IList <string> biggestKeys = new List <string>(rhsCounter.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(rhsCounter));
            PrintWriter pw = tlpp.Pw();

            foreach (string rhs in biggestKeys)
            {
                pw.Printf("%s\t%d%n", rhs, (int)rhsCounter.GetCount(rhs));
            }
            pw.Close();
        }
示例#26
0
        /// <summary>
        /// Scores system-extracted entity mentions against the gold standard and
        /// prints per-label and total precision / recall / F1 to
        /// <paramref name="pw"/>.
        /// </summary>
        /// <param name="pw">Writer that receives the results table.</param>
        /// <param name="goldStandard">Gold-annotated sentences.</param>
        /// <param name="extractorOutput">System output, aligned 1:1 with <paramref name="goldStandard"/> after Align().</param>
        public override void PrintResults(PrintWriter pw, IList <ICoreMap> goldStandard, IList <ICoreMap> extractorOutput)
        {
            ResultsPrinter.Align(goldStandard, extractorOutput);
            // Per-label counts: true positives, system predictions, gold mentions.
            ICounter <string> correct   = new ClassicCounter <string>();
            ICounter <string> predicted = new ClassicCounter <string>();
            ICounter <string> gold      = new ClassicCounter <string>();

            for (int i = 0; i < goldStandard.Count; i++)
            {
                ICoreMap goldSent = goldStandard[i];
                ICoreMap sysSent  = extractorOutput[i];
                string   sysText  = sysSent.Get(typeof(CoreAnnotations.TextAnnotation));
                string   goldText = goldSent.Get(typeof(CoreAnnotations.TextAnnotation));
                if (verbose)
                {
                    log.Info("SCORING THE FOLLOWING SENTENCE:");
                    log.Info(sysSent.Get(typeof(CoreAnnotations.TokensAnnotation)));
                }
                // Gold mentions already matched to a prediction; each gold mention can
                // be credited to at most one system mention.
                HashSet <string>      matchedGolds = new HashSet <string>();
                IList <EntityMention> goldEntities = goldSent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                if (goldEntities == null)
                {
                    goldEntities = new List <EntityMention>();
                }
                // Tally gold mentions per label (excluded classes are skipped everywhere).
                foreach (EntityMention m in goldEntities)
                {
                    string label = MakeLabel(m);
                    if (excludedClasses != null && excludedClasses.Contains(label))
                    {
                        continue;
                    }
                    gold.IncrementCount(label);
                }
                IList <EntityMention> sysEntities = sysSent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                if (sysEntities == null)
                {
                    sysEntities = new List <EntityMention>();
                }
                // For each prediction, search the (not yet matched) gold mentions for
                // an equal mention; count a true positive if one is found.
                foreach (EntityMention m_1 in sysEntities)
                {
                    string label = MakeLabel(m_1);
                    if (excludedClasses != null && excludedClasses.Contains(label))
                    {
                        continue;
                    }
                    predicted.IncrementCount(label);
                    if (verbose)
                    {
                        log.Info("COMPARING PREDICTED MENTION: " + m_1);
                    }
                    bool found = false;
                    foreach (EntityMention gm in goldEntities)
                    {
                        if (matchedGolds.Contains(gm.GetObjectId()))
                        {
                            continue;
                        }
                        if (verbose)
                        {
                            log.Info("\tagainst: " + gm);
                        }
                        if (gm.Equals(m_1, useSubTypes))
                        {
                            if (verbose)
                            {
                                log.Info("\t\t\tMATCH!");
                            }
                            found = true;
                            matchedGolds.Add(gm.GetObjectId());
                            if (verboseInstances)
                            {
                                log.Info("TRUE POSITIVE: " + m_1 + " matched " + gm);
                                log.Info("In sentence: " + sysText);
                            }
                            break;
                        }
                    }
                    if (found)
                    {
                        correct.IncrementCount(label);
                    }
                    else
                    {
                        if (verboseInstances)
                        {
                            log.Info("FALSE POSITIVE: " + m_1.ToString());
                            log.Info("In sentence: " + sysText);
                        }
                    }
                }
                // Any gold mention never matched is a false negative (logging only).
                if (verboseInstances)
                {
                    foreach (EntityMention m_2 in goldEntities)
                    {
                        string label = MakeLabel(m_2);
                        if (!matchedGolds.Contains(m_2.GetObjectId()) && (excludedClasses == null || !excludedClasses.Contains(label)))
                        {
                            log.Info("FALSE NEGATIVE: " + m_2.ToString());
                            log.Info("In sentence: " + goldText);
                        }
                    }
                }
            }
            // Micro-averaged totals accumulated over all labels.
            double totalCount     = 0;
            double totalCorrect   = 0;
            double totalPredicted = 0;

            pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF");
            IList <string> labels = new List <string>(gold.KeySet());

            labels.Sort();
            foreach (string label_1 in labels)
            {
                if (excludedClasses != null && excludedClasses.Contains(label_1))
                {
                    continue;
                }
                double numCorrect   = correct.GetCount(label_1);
                double numPredicted = predicted.GetCount(label_1);
                double trueCount    = gold.GetCount(label_1);
                // trueCount > 0 here because label_1 comes from gold's key set.
                double precision    = (numPredicted > 0) ? (numCorrect / numPredicted) : 0;
                double recall       = numCorrect / trueCount;
                double f            = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
                pw.Println(StringUtils.PadOrTrim(label_1, 21) + "\t" + numCorrect + "\t" + numPredicted + "\t" + trueCount + "\t" + Formatter.Format(precision * 100) + "\t" + Formatter.Format(100 * recall) + "\t" + Formatter.Format(100 * f));
                totalCount     += trueCount;
                totalCorrect   += numCorrect;
                totalPredicted += numPredicted;
            }
            // NOTE(review): if there are no gold mentions at all, totalCount == 0 and
            // recall_1 is NaN — confirm callers never score an empty gold standard.
            double precision_1 = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0;
            double recall_1    = totalCorrect / totalCount;
            double f_1         = (totalPredicted > 0 && totalCorrect > 0) ? 2 * precision_1 * recall_1 / (precision_1 + recall_1) : 0.0;

            pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + Formatter.Format(100 * precision_1) + "\t" + Formatter.Format(100 * recall_1) + "\t" + Formatter.Format(100 * f_1));
        }
        /// <summary>
        /// Return various statistics about the treebank (number of sentences,
        /// words, tag set, etc.).
        /// </summary>
        /// <param name="tlp">
        /// The TreebankLanguagePack used to determine punctuation and an
        /// appropriate character encoding
        /// </param>
        /// <returns>A big string for human consumption describing the treebank</returns>
        public virtual string TextualSummary(ITreebankLanguagePack tlp)
        {
            int  numTrees         = 0;
            int  numTreesLE40     = 0;
            int  numNonUnaryRoots = 0;
            Tree nonUnaryEg       = null;
            ClassicCounter <Tree>   nonUnaries = new ClassicCounter <Tree>();
            ClassicCounter <string> roots      = new ClassicCounter <string>();
            ClassicCounter <string> starts     = new ClassicCounter <string>();
            ClassicCounter <string> puncts     = new ClassicCounter <string>();
            int numUnenclosedLeaves            = 0;
            int numLeaves     = 0;
            int numNonPhrasal = 0;
            int numPreTerminalWithMultipleChildren = 0;
            int numWords                       = 0;
            int numTags                        = 0;
            int shortestSentence               = int.MaxValue;
            int longestSentence                = 0;
            int numNullLabel                   = 0;
            ICollection <string>    words      = Generics.NewHashSet();
            ClassicCounter <string> tags       = new ClassicCounter <string>();
            ClassicCounter <string> cats       = new ClassicCounter <string>();
            Tree leafEg                        = null;
            Tree preTerminalMultipleChildrenEg = null;
            Tree nullLabelEg                   = null;
            Tree rootRewritesAsTaggedWordEg    = null;

            foreach (Tree t in this)
            {
                roots.IncrementCount(t.Value());
                numTrees++;
                int leng = t.Yield().Count;
                if (leng <= 40)
                {
                    numTreesLE40++;
                }
                if (leng < shortestSentence)
                {
                    shortestSentence = leng;
                }
                if (leng > longestSentence)
                {
                    longestSentence = leng;
                }
                if (t.NumChildren() > 1)
                {
                    if (numNonUnaryRoots == 0)
                    {
                        nonUnaryEg = t;
                    }
                    if (numNonUnaryRoots < 100)
                    {
                        nonUnaries.IncrementCount(t.LocalTree());
                    }
                    numNonUnaryRoots++;
                }
                else
                {
                    if (t.IsLeaf())
                    {
                        numUnenclosedLeaves++;
                    }
                    else
                    {
                        Tree t2 = t.FirstChild();
                        if (t2.IsLeaf())
                        {
                            numLeaves++;
                            leafEg = t;
                        }
                        else
                        {
                            if (t2.IsPreTerminal())
                            {
                                if (numNonPhrasal == 0)
                                {
                                    rootRewritesAsTaggedWordEg = t;
                                }
                                numNonPhrasal++;
                            }
                        }
                        starts.IncrementCount(t2.Value());
                    }
                }
                foreach (Tree subtree in t)
                {
                    ILabel lab = subtree.Label();
                    if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
                    {
                        if (numNullLabel == 0)
                        {
                            nullLabelEg = subtree;
                        }
                        numNullLabel++;
                        if (lab == null)
                        {
                            subtree.SetLabel(new StringLabel(string.Empty));
                        }
                        else
                        {
                            if (lab.Value() == null)
                            {
                                subtree.Label().SetValue(string.Empty);
                            }
                        }
                    }
                    if (subtree.IsLeaf())
                    {
                        numWords++;
                        words.Add(subtree.Value());
                    }
                    else
                    {
                        if (subtree.IsPreTerminal())
                        {
                            numTags++;
                            tags.IncrementCount(subtree.Value());
                            if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                            {
                                puncts.IncrementCount(subtree.FirstChild().Value());
                            }
                        }
                        else
                        {
                            if (subtree.IsPhrasal())
                            {
                                bool hasLeafChild = false;
                                foreach (Tree kt in subtree.Children())
                                {
                                    if (kt.IsLeaf())
                                    {
                                        hasLeafChild = true;
                                    }
                                }
                                if (hasLeafChild)
                                {
                                    numPreTerminalWithMultipleChildren++;
                                    if (preTerminalMultipleChildrenEg == null)
                                    {
                                        preTerminalMultipleChildrenEg = subtree;
                                    }
                                }
                                cats.IncrementCount(subtree.Value());
                            }
                            else
                            {
                                throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
                            }
                        }
                    }
                }
            }
            StringWriter sw = new StringWriter(2000);
            PrintWriter  pw = new PrintWriter(sw);
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(0);
            pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
            if (numTrees > 0)
            {
                if (numTags != numWords)
                {
                    pw.Println("  Warning! numTags differs and is " + numTags);
                }
                if (roots.Size() == 1)
                {
                    string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
                    pw.Println("  The root category is: " + root);
                }
                else
                {
                    pw.Println("  Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
                }
                if (numNonUnaryRoots > 0)
                {
                    pw.Print("  Warning! " + numNonUnaryRoots + " trees without unary initial rewrite.  ");
                    if (numNonUnaryRoots > 100)
                    {
                        pw.Print("First 100 ");
                    }
                    pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
                    pw.Println("    Example: " + nonUnaryEg);
                }
                if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
                {
                    pw.Println("  Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
                    if (numLeaves > 0)
                    {
                        pw.Println("  Example bad root rewrites as leaf: " + leafEg);
                    }
                    if (numNonPhrasal > 0)
                    {
                        pw.Println("  Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
                    }
                }
                if (numNullLabel > 0)
                {
                    pw.Println("  Warning!  " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
                    pw.Println("    " + nullLabelEg);
                }
                if (numPreTerminalWithMultipleChildren > 0)
                {
                    pw.Println("  Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
                    pw.Println("    Example: " + preTerminalMultipleChildrenEg);
                }
                pw.Println("  Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + (((numWords * 100) / numTrees) / 100.0) + " words.");
                pw.Println("  " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");
                string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
                // What a dopey choice using 0 as an empty element name!!
                // The problem with the below is that words aren't turned into a basic
                // category, but empties commonly are indexed....  Would need to look
                // for them with a suffix of -[0-9]+
                ICollection <string> knownEmpties        = Generics.NewHashSet(Arrays.AsList(empties));
                ICollection <string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
                if (!emptiesIntersection.IsEmpty())
                {
                    pw.Println("  Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
                }
                ICollection <string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
                if (!joint.IsEmpty())
                {
                    pw.Println("  Warning! " + joint.Count + " items are tags and categories: " + joint);
                }
                foreach (string cat in cats.KeySet())
                {
                    if (cat != null && cat.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with categories containing '@' like: " + cat);
                        break;
                    }
                }
                foreach (string cat_1 in tags.KeySet())
                {
                    if (cat_1 != null && cat_1.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with tags containing '@' like: " + cat_1);
                        break;
                    }
                }
                pw.Println("    Cats: " + Counters.ToString(cats, nf));
                pw.Println("    Tags: " + Counters.ToString(tags, nf));
                pw.Println("    " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
                if (!puncts.IsEmpty())
                {
                    pw.Println("    Puncts: " + Counters.ToString(puncts, nf));
                }
            }
            return(sw.ToString());
        }
        /// <summary>
        /// Command-line driver: trains a <c>FactoredLexicon</c> on a treebank and reports
        /// tagging accuracy over a dev (tuning) set, plus a per-gold-tag error breakdown.
        /// Expects exactly four arguments: language, comma-separated morphological feature
        /// list, training treebank path, and dev treebank path.
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }
            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank         = tlpp.DiskTreebank();

            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();

            devTreebank.LoadPath(args[3]);
            MorphoFeatureSpecification morphoSpec;
            Options options = GetOptions(language);

            // Only Arabic and French have morphological feature specifications here;
            // any other language is rejected outright.
            if (language.Equals(Language.Arabic))
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
                string[] languageOptions = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(languageOptions, 0);
            }
            else
            {
                if (language.Equals(Language.French))
                {
                    morphoSpec = new FrenchMorphoFeatureSpecification();
                    string[] languageOptions = new string[] { "-frenchFactored" };
                    tlpp.SetOptionFlag(languageOptions, 0);
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            string featureList = args[1];

            // Activate each requested morphological feature (comma-separated list).
            string[] features = featureList.Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);
            // Create word and tag indices
            // Save trees in a collection since the interface requires that....
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            // Apply the language pack's tree transformations in place before training.
            foreach (Tree tree in trainTreebank)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                trainTrees.Add(tree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            // Drop the reference so the training trees can be garbage collected.
            trainTrees = null;
            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);

            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect             = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores        = new ClassicCounter <int>();
                bool           noRules          = true;
                // -1 is a sentinel meaning "gold tag never proposed by the lexicon";
                // it maps to the "UNSEEN" bucket in the error counter below.
                int            goldTagId        = -1;
                while (itr.MoveNext())
                {
                    noRules = false;
                    IntTaggedWord iTW = itr.Current;
                    if (iTW.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = iTW.Tag();
                    }
                    float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(iTW.Tag(), tagScore);
                }
                if (noRules)
                {
                    // The lexicon proposed no taggings at all for this word/feature pair.
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++nCorrect;
                    }
                    else
                    {
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                }
                log.Info();
            }
            // Output accuracy
            double acc = (double)nCorrect / (double)tuningSet.Count;

            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            IList <string> biggestKeys = new List <string>(errors.KeySet());

            // Sort error keys by descending count, then normalize to report percentages.
            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string key in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
            }
        }
// 示例#29 ("Example #29" — stray marker left over from code-sample extraction; kept as a comment so the file parses)
// 0  (extraction artifact, likewise commented out)
        /// <summary>
        /// Command-line driver: loads a treebank from disk and prints rule statistics —
        /// tree count, unary and n-ary (branching) rule counts, the mean and standard
        /// deviation of the branching factor, and the entropy of the n-ary rule-type
        /// distribution.
        /// </summary>
        /// <param name="args">options (e.g. -l language, -e encoding) plus the treebank path; see <c>usage</c></param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            // Process command-line options
            Properties options  = StringUtils.ArgsToProperties(args, optionArgDefinitions);
            string     fileName = options.GetProperty(string.Empty);

            if (fileName == null || fileName.Equals(string.Empty))
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            string encoding = options.GetProperty("e", "UTF-8");

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();

            tb.LoadPath(fileName);
            // Statistics
            ICounter <string> binaryRuleTypes  = new ClassicCounter <string>(20000);
            IList <int>       branchingFactors = new List <int>(20000);
            int nTrees                 = 0;
            int nUnaryRules            = 0;
            int nBinaryRules           = 0;
            int binaryBranchingFactors = 0;
            // Read the treebank
            PrintWriter pw = tlpp.Pw();

            foreach (Tree t in tb)
            {
                // BUGFIX: the original (Java-converted) code assigned to the foreach
                // iteration variable, which is a C# compile error (CS1656). Strip the
                // ROOT wrapper via a local copy instead.
                Tree tree = t;
                if (tree.Value().Equals("ROOT"))
                {
                    tree = tree.FirstChild();
                }
                ++nTrees;
                foreach (Tree subTree in tree)
                {
                    if (subTree.IsPhrasal())
                    {
                        if (subTree.NumChildren() > 1)
                        {
                            // n-ary rewrite (two or more children)
                            ++nBinaryRules;
                            branchingFactors.Add(subTree.NumChildren());
                            binaryBranchingFactors += subTree.NumChildren();
                            binaryRuleTypes.IncrementCount(TreeToRuleString(subTree));
                        }
                        else
                        {
                            ++nUnaryRules;
                        }
                    }
                }
            }
            // Guard the 0/0 case (empty treebank) which would otherwise yield NaN.
            double mean = nBinaryRules == 0 ? 0.0 : (double)binaryBranchingFactors / (double)nBinaryRules;

            System.Console.Out.Printf("#trees:\t%d%n", nTrees);
            System.Console.Out.Printf("#binary:\t%d%n", nBinaryRules);
            System.Console.Out.Printf("#binary types:\t%d%n", binaryRuleTypes.KeySet().Count);
            System.Console.Out.Printf("mean branching:\t%.4f%n", mean);
            System.Console.Out.Printf("stddev branching:\t%.4f%n", StandardDeviation(branchingFactors, mean));
            System.Console.Out.Printf("rule entropy:\t%.5f%n", Counters.Entropy(binaryRuleTypes));
            System.Console.Out.Printf("#unaries:\t%d%n", nUnaryRules);
        }
        /// <summary>
        /// Command-line driver: counts the distinct word types that carry a given
        /// punctuation tag in a treebank and prints them with their frequencies,
        /// sorted by descending count.
        /// </summary>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank tb       = null;
            string       encoding = "UTF-8";
            string       puncTag  = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        // -l <language>: replace the default English params.
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        // -e <encoding>: I/O encoding for the treebank.
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    // First positional arg is the punctuation tag; the NEXT arg is the
                    // treebank path (args[i++] reads puncTag, then args[i] below reads
                    // the path, and the loop's i++ skips past it).
                    // NOTE(review): if the path is omitted, args[i] here indexes past
                    // the end of args — confirm callers always pass both.
                    puncTag = args[i++];
                    if (tb == null)
                    {
                        // NOTE(review): tlpp is initialized to a non-null default above,
                        // so this null branch appears unreachable as written.
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    tb.LoadPath(args[i]);
                }
            }
            ICounter <string> puncTypes = new ClassicCounter <string>();

            // Tally each word whose tag equals the requested punctuation tag.
            foreach (Tree t in tb)
            {
                IList <CoreLabel> yield = t.TaggedLabeledYield();
                foreach (CoreLabel word in yield)
                {
                    if (word.Tag().Equals(puncTag))
                    {
                        puncTypes.IncrementCount(word.Word());
                    }
                }
            }
            IList <string> biggestKeys = new List <string>(puncTypes.KeySet());

            // Most frequent word types first.
            biggestKeys.Sort(Counters.ToComparatorDescending(puncTypes));
            PrintWriter pw = tlpp.Pw();

            foreach (string wordType in biggestKeys)
            {
                pw.Printf("%s\t%d%n", wordType, (int)puncTypes.GetCount(wordType));
            }
            pw.Close();
        }