public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            // Returns log P(word|tag) factored into a word-tag term and a
            // morphology-tag term.
            int wordId = iTW.Word();
            int tagId  = iTW.Tag();
            // Force 1-best path to go through the boundary symbol
            // (deterministic tagging): the boundary tagging costs nothing.
            int boundaryId    = wordIndex.IndexOf(LexiconConstants.Boundary);
            int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);

            if (wordId == boundaryId && tagId == boundaryTagId)
            {
                return(0.0f);
            }
            // Morphological features: split the feature spec into (lemma, rich
            // morph tag), then reduce the rich tag to the active feature subset.
            // (The lemma itself is not scored — the p_L_T factor is disabled —
            // so the previous lemma/wordIndex lookup has been removed as dead code.)
            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
            string richMorphTag    = lemmaMorph.Second();
            string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();

            // Fall back to a sentinel when no morphological analysis is available.
            reducedMorphTag = reducedMorphTag.Length == 0 ? NoMorphAnalysis : reducedMorphTag;
            int morphId = morphIndex.AddToIndex(reducedMorphTag);
            // Score the factors and create the rule score p_W_T (log space)
            double p_W_Tf = Math.Log(ProbWordTag(word, loc, wordId, tagId));
            //    double p_L_T = Math.log(probLemmaTag(word, loc, tagId, lemmaId));
            double p_L_T = 0.0;
            double p_M_T = Math.Log(ProbMorphTag(tagId, morphId));
            double p_W_T = p_W_Tf + p_L_T + p_M_T;

            // Filter low probability taggings with a categorical cutoff
            return(p_W_T > -100.0 ? (float)p_W_T : float.NegativeInfinity);
        }
Exemple #2
0
 /// <summary>Adds the tagging with count to the data structures in this Lexicon.</summary>
 protected internal virtual void AddTagging(bool seen, IntTaggedWord itw, double count)
 {
     if (!seen)
     {
         // Unseen taggings are delegated to the unknown-word model.
         // rules.add(itw);
         uwModel.AddTagging(seen, itw, count);
         return;
     }
     seenCounter.IncrementCount(itw, count);
     if (itw.Tag() == nullTag)
     {
         // (word, nullTag) entries track the words themselves.
         words.Add(itw);
     }
     else if (itw.Word() == nullWord)
     {
         // (nullWord, tag) entries track the tags themselves.
         tags.Add(itw);
     }
 }
        /// <summary>Do max language model markov segmentation.</summary>
        /// <remarks>
        /// Do max language model markov segmentation.
        /// Note that this algorithm inherently tags words as it goes, but that
        /// we throw away the tags in the final result so that the segmented words
        /// are untagged.  (Note: for a couple of years till Aug 2007, a tagged
        /// result was returned, but this messed up the parser, because it could
        /// use no tagging but the given tagging, which often wasn't very good.
        /// Or in particular it was a subcategorized tagging which never worked
        /// with the current forceTags option which assumes that gold taggings are
        /// inherently basic taggings.)
        /// </remarks>
        /// <param name="s">A String to segment</param>
        /// <returns>The list of segmented words.</returns>
        private List <IHasWord> SegmentWordsWithMarkov(string s)
        {
            // We don't want to accidentally register words that we don't know
            // about in the wordIndex, so we wrap it with a DeltaIndex
            DeltaIndex <string> deltaWordIndex = new DeltaIndex <string>(wordIndex);
            int length = s.Length;
            // NOTE(review): the chart's tag dimension is sized by POSes.Count;
            // this assumes every POS's tagIndex id is < POSes.Count — TODO confirm.
            int numTags = POSes.Count;

            // scores[start][end][tag]: score of span [start,end) with initial word of this tag
            double[][][] scores = new double[length][][];
            // splitBacktrace[start][end][tag]: best (end index of) first word for this span with this tag
            int[][][] splitBacktrace = new int[length][][];
            // POSbacktrace[start][end][tag]: best tag for second word over this span, if first is this tag
            int[][][] POSbacktrace = new int[length][][];
            for (int i = 0; i < length; i++)
            {
                // BUGFIX: the inner arrays of these jagged charts were never
                // allocated (a translation artifact of Java's rectangular
                // `new double[length][length+1][numTags]`), so Arrays.Fill
                // threw NullReferenceException on the first call. Allocate them here.
                scores[i] = new double[length + 1][];
                splitBacktrace[i] = new int[length + 1][];
                POSbacktrace[i] = new int[length + 1][];
                for (int j = 0; j < length + 1; j++)
                {
                    scores[i][j] = new double[numTags];
                    splitBacktrace[i][j] = new int[numTags];
                    POSbacktrace[i][j] = new int[numTags];
                    Arrays.Fill(scores[i][j], double.NegativeInfinity);
                }
            }
            // first fill in word probabilities (candidate words up to 10 chars long)
            for (int diff = 1; diff <= 10; diff++)
            {
                for (int start = 0; start + diff <= length; start++)
                {
                    int           end     = start + diff;
                    StringBuilder wordBuf = new StringBuilder();
                    for (int pos = start; pos < end; pos++)
                    {
                        wordBuf.Append(s[pos]);
                    }
                    string word = wordBuf.ToString();
                    foreach (string tag in POSes)
                    {
                        IntTaggedWord itw   = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                        double        score = lex.Score(itw, 0, word, null);
                        if (start == 0)
                        {
                            // Sentence-initial words also pay the initial-POS prior
                            score += Math.Log(initialPOSDist.ProbabilityOf(tag));
                        }
                        scores[start][end][itw.Tag()]         = score;
                        splitBacktrace[start][end][itw.Tag()] = end;
                    }
                }
            }
            // now fill in word combination probabilities
            for (int diff_1 = 2; diff_1 <= length; diff_1++)
            {
                for (int start = 0; start + diff_1 <= length; start++)
                {
                    int end = start + diff_1;
                    for (int split = start + 1; split < end && split - start <= 10; split++)
                    {
                        foreach (string tag in POSes)
                        {
                            int tagNum = tagIndex.AddToIndex(tag);
                            // Only extend entries whose first word spans exactly [start, split)
                            if (splitBacktrace[start][split][tagNum] != split)
                            {
                                continue;
                            }
                            Distribution <string> rTagDist = markovPOSDists[tag];
                            if (rTagDist == null)
                            {
                                // this happens with "*" POS
                                continue;
                            }
                            foreach (string rTag in POSes)
                            {
                                // Score = first word + rest of span + markov tag transition
                                int    rTagNum  = tagIndex.AddToIndex(rTag);
                                double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.Log(rTagDist.ProbabilityOf(rTag));
                                if (newScore > scores[start][end][tagNum])
                                {
                                    scores[start][end][tagNum]         = newScore;
                                    splitBacktrace[start][end][tagNum] = split;
                                    POSbacktrace[start][end][tagNum]   = rTagNum;
                                }
                            }
                        }
                    }
                }
            }
            // Recover the best segmentation by following the backtraces from [0, length)
            int             nextPOS = ArrayMath.Argmax(scores[0][length]);
            List <IHasWord> words   = new List <IHasWord>();
            int             start_1 = 0;

            while (start_1 < length)
            {
                int           split   = splitBacktrace[start_1][length][nextPOS];
                StringBuilder wordBuf = new StringBuilder();
                for (int i_1 = start_1; i_1 < split; i_1++)
                {
                    wordBuf.Append(s[i_1]);
                }
                string word = wordBuf.ToString();
                // Tags are deliberately discarded here (see remarks above)
                words.Add(new Word(word));
                if (split < length)
                {
                    nextPOS = POSbacktrace[start_1][length][nextPOS];
                }
                start_1 = split;
            }
            return(words);
        }
        // CDM 2007: I wonder what this does differently from segmentWordsWithMarkov???
        /// <summary>
        /// Segments a string into words with a unigram-style dynamic program
        /// (no markov tag transitions), returning tagged words.
        /// </summary>
        /// <param name="s">A String to segment</param>
        /// <returns>The list of segmented, tagged words.</returns>
        private List <TaggedWord> BasicSegmentWords(string s)
        {
            // We don't want to accidentally register words that we don't know
            // about in the wordIndex, so we wrap it with a DeltaIndex
            DeltaIndex <string> deltaWordIndex = new DeltaIndex <string>(wordIndex);
            int length = s.Length;

            // scores[start][end]: best score of span [start, end)
            double[][] scores = new double[length][];
            // splitBacktrace[start][end]: best (last index of) first word for this span
            int[][] splitBacktrace = new int[length][];
            // POSbacktrace[start][end]: best tag id for the word over this span
            int[][] POSbacktrace = new int[length][];
            for (int i = 0; i < length; i++)
            {
                // BUGFIX: the inner arrays of these jagged charts were never
                // allocated (translation artifact of Java's rectangular
                // `new double[length][length+1]`), so Arrays.Fill threw
                // NullReferenceException on the first call. Allocate them here.
                scores[i] = new double[length + 1];
                splitBacktrace[i] = new int[length + 1];
                POSbacktrace[i] = new int[length + 1];
                Arrays.Fill(scores[i], double.NegativeInfinity);
            }
            // first fill in word probabilities (candidate words up to 10 chars long)
            for (int diff = 1; diff <= 10; diff++)
            {
                for (int start = 0; start + diff <= length; start++)
                {
                    int           end     = start + diff;
                    StringBuilder wordBuf = new StringBuilder();
                    for (int pos = start; pos < end; pos++)
                    {
                        wordBuf.Append(s[pos]);
                    }
                    string word = wordBuf.ToString();
                    foreach (string tag in POSes)
                    {
                        // Word score plus the POS prior, in log space
                        IntTaggedWord itw      = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                        double        newScore = lex.Score(itw, 0, word, null) + Math.Log(lex.GetPOSDistribution().ProbabilityOf(tag));
                        if (newScore > scores[start][end])
                        {
                            scores[start][end]         = newScore;
                            splitBacktrace[start][end] = end;
                            POSbacktrace[start][end]   = itw.Tag();
                        }
                    }
                }
            }
            // now fill in word combination probabilities
            for (int diff_1 = 2; diff_1 <= length; diff_1++)
            {
                for (int start = 0; start + diff_1 <= length; start++)
                {
                    int end = start + diff_1;
                    for (int split = start + 1; split < end && split - start <= 10; split++)
                    {
                        // only consider words on left
                        if (splitBacktrace[start][split] != split)
                        {
                            continue;
                        }
                        double newScore = scores[start][split] + scores[split][end];
                        if (newScore > scores[start][end])
                        {
                            scores[start][end]         = newScore;
                            splitBacktrace[start][end] = split;
                        }
                    }
                }
            }
            // Recover the best segmentation by following the backtraces
            List <TaggedWord> words = new List <TaggedWord>();
            int start_1 = 0;

            while (start_1 < length)
            {
                int           end     = splitBacktrace[start_1][length];
                StringBuilder wordBuf = new StringBuilder();
                for (int pos = start_1; pos < end; pos++)
                {
                    wordBuf.Append(s[pos]);
                }
                string word = wordBuf.ToString();
                string tag  = tagIndex.Get(POSbacktrace[start_1][end]);
                words.Add(new TaggedWord(word, tag));
                start_1 = end;
            }
            // words is already a List<TaggedWord>; the former defensive copy was redundant
            return(words);
        }
Exemple #5
0
        /// <summary>
        /// Get the score of this word with this tag (as an IntTaggedWord) at this
        /// location.
        /// </summary>
        /// <remarks>
        /// Get the score of this word with this tag (as an IntTaggedWord) at this
        /// location. (Presumably an estimate of P(word | tag).)
        /// <p>
        /// <i>Implementation documentation:</i>
        /// Seen:
        /// c_W = count(W)      c_TW = count(T,W)
        /// c_T = count(T)      c_Tunseen = count(T) among new words in 2nd half
        /// total = count(seen words)   totalUnseen = count("unseen" words)
        /// p_T_U = Pmle(T|"unseen")
        /// pb_T_W = P(T|W). If (c_W &gt; smoothInUnknownsThreshold) = c_TW/c_W
        /// Else (if not smart mutation) pb_T_W = bayes prior smooth[1] with p_T_U
        /// p_T= Pmle(T)          p_W = Pmle(W)
        /// pb_W_T = log(pb_T_W * p_W / p_T) [Bayes rule]
        /// Note that this doesn't really properly reserve mass to unknowns.
        /// Unseen:
        /// c_TS = count(T,Sig|Unseen)      c_S = count(Sig)   c_T = count(T|Unseen)
        /// c_U = totalUnseen above
        /// p_T_U = Pmle(T|Unseen)
        /// pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
        /// pb_W_T = log(P(W|T)) inverted
        /// </remarks>
        /// <param name="iTW">An IntTaggedWord pairing a word and POS tag</param>
        /// <param name="loc">
        /// The position in the sentence. <i>In the default implementation
        /// this is used only for unknown words to change their probability
        /// distribution when sentence initial</i>
        /// </param>
        /// <param name="word">The word itself; passed through to the unknown word model</param>
        /// <param name="featureSpec">Unused in this implementation</param>
        /// <returns>A float score, usually, log P(word|tag)</returns>
        public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            // c_TW: joint count of this exact (tag, word) pair
            double c_TW = seenCounter.GetCount(iTW);
            // (word, nullTag) keys hold marginal word counts
            IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
            double c_W = seenCounter.GetCount(temp);
            // totals over all seen events and over all "unseen"-model events
            double total       = seenCounter.GetCount(NullItw);
            double totalUnseen = uwModel.UnSeenCounter().GetCount(NullItw);

            // (nullWord, tag) keys hold marginal tag counts
            temp = new IntTaggedWord(nullWord, iTW.tag);
            double c_T       = seenCounter.GetCount(temp);
            double c_Tunseen = uwModel.UnSeenCounter().GetCount(temp);
            double pb_W_T;
            // always set below
            bool seen = (c_W > 0.0);

            if (seen)
            {
                // known word model for P(T|W)
                double p_T_U;
                if (useSignatureForKnownSmoothing)
                {
                    // only works for English currently
                    p_T_U = GetUnknownWordModel().ScoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
                }
                else
                {
                    p_T_U = c_Tunseen / totalUnseen;
                }
                double pb_T_W;
                // always set below
                if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0)
                {
                    // we've seen the word enough times to have confidence in its tagging
                    pb_T_W = c_TW / c_W;
                }
                else
                {
                    // we haven't seen the word enough times to have confidence in its
                    // tagging
                    if (smartMutation)
                    {
                        int numTags = tagIndex.Size();
                        if (m_TT == null || numTags != m_T.Length)
                        {
                            // (re)build the tag-mutation tables if the tag set changed
                            BuildPT_T();
                        }
                        p_T_U *= 0.1;
                        // Mix in evidence from the word's other observed tags,
                        // weighted by the tag-mutation rates m_TT / m_T
                        for (int t = 0; t < numTags; t++)
                        {
                            IntTaggedWord iTW2   = new IntTaggedWord(iTW.word, t);
                            double        p_T_W2 = seenCounter.GetCount(iTW2) / c_W;
                            if (p_T_W2 > 0)
                            {
                                p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
                            }
                        }
                    }
                    // Bayesian smoothing of P(T|W) with the prior p_T_U, weight smooth[1]
                    pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
                }
                double p_T = (c_T / total);
                double p_W = (c_W / total);
                // Invert P(T|W) to P(W|T) via Bayes' rule, in log space
                pb_W_T = Math.Log(pb_T_W * p_W / p_T);
            }
            else
            {
                // when unseen, defer entirely to the unknown word model
                if (loc >= 0)
                {
                    pb_W_T = GetUnknownWordModel().Score(iTW, loc, c_T, total, smooth[0], word);
                }
                else
                {
                    // For negative loc we do a weighted average over positions 0 and 1
                    // for the dependency grammar
                    double pb_W0_T = GetUnknownWordModel().Score(iTW, 0, c_T, total, smooth[0], word);
                    double pb_W1_T = GetUnknownWordModel().Score(iTW, 1, c_T, total, smooth[0], word);
                    pb_W_T = Math.Log((Math.Exp(pb_W0_T) + 2 * Math.Exp(pb_W1_T)) / 3);
                }
            }
            // (Removed a dead `string tag = tagIndex.Get(iTW.Tag());` local that was
            // only used by since-deleted debug output.)
            // Categorical cutoff if score is too low
            if (pb_W_T > -100.0)
            {
                return((float)pb_W_T);
            }
            return(float.NegativeInfinity);
        }
        /// <summary>
        /// Command-line driver: trains a FactoredLexicon on a treebank and reports
        /// tagging accuracy (and an error breakdown by gold tag) on a dev treebank.
        /// </summary>
        /// <param name="args">language features train_file dev_file</param>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }
            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank         = tlpp.DiskTreebank();

            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();

            devTreebank.LoadPath(args[3]);
            MorphoFeatureSpecification morphoSpec;
            Options options = GetOptions(language);

            // Only Arabic and French have factored-lexicon support here;
            // any other language is rejected below.
            if (language.Equals(Language.Arabic))
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
                string[] languageOptions = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(languageOptions, 0);
            }
            else
            {
                if (language.Equals(Language.French))
                {
                    morphoSpec = new FrenchMorphoFeatureSpecification();
                    string[] languageOptions = new string[] { "-frenchFactored" };
                    tlpp.SetOptionFlag(languageOptions, 0);
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            // Activate the comma-separated morphological features given on the command line
            string featureList = args[1];

            string[] features = featureList.Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);
            // Create word and tag indices
            // Save trees in a collection since the interface requires that....
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            foreach (Tree tree in trainTreebank)
            {
                // Apply the language pack's tree transformations to non-leaf nodes
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                trainTrees.Add(tree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            // Release the training trees so they can be garbage collected
            trainTrees = null;
            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);

            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect             = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                // Score every candidate tagging of this word; the highest-scoring
                // tag is the hypothesis, compared against the gold tag id.
                IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores        = new ClassicCounter <int>();
                bool           noRules          = true;
                int            goldTagId        = -1;
                while (itr.MoveNext())
                {
                    noRules = false;
                    IntTaggedWord iTW = itr.Current;
                    if (iTW.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = iTW.Tag();
                    }
                    float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(iTW.Tag(), tagScore);
                }
                if (noRules)
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++nCorrect;
                    }
                    else
                    {
                        // goldTagId stays -1 when the gold tag never appeared
                        // among the candidate taggings
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                }
                log.Info();
            }
            // Output accuracy
            double acc = (double)nCorrect / (double)tuningSet.Count;

            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            // Sort error categories by count (descending) before normalizing to percentages
            IList <string> biggestKeys = new List <string>(errors.KeySet());

            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string key in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
            }
        }