/// <summary>
        /// Scores a word/tag pair as the sum of the log word-tag factor and the log
        /// morphology-tag factor. The lemma factor is currently disabled and contributes 0.
        /// </summary>
        /// <param name="iTW">Word/tag index pair to score.</param>
        /// <param name="loc">Position of the word; passed through to <c>ProbWordTag</c>.</param>
        /// <param name="word">Surface form; split together with <paramref name="featureSpec"/> into lemma and morphology.</param>
        /// <param name="featureSpec">Feature string handed to <c>SplitMorphString</c>.</param>
        /// <returns>
        /// <c>0</c> for the boundary word carrying the boundary tag; otherwise the log score,
        /// or negative infinity when that score is not greater than -100.
        /// </returns>
        public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            int wordId = iTW.Word();
            int tagId  = iTW.Tag();
            // Force 1-best path to go through the boundary symbol
            // (deterministic tagging)
            int boundaryId    = wordIndex.IndexOf(LexiconConstants.Boundary);
            int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);

            if (wordId == boundaryId && tagId == boundaryTagId)
            {
                return(0.0f);
            }
            // Morphological features. Only the morphology half of the split is used;
            // the lemma would only feed the disabled lemma factor below, so it is not looked up.
            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
            string richMorphTag    = lemmaMorph.Second();
            string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();

            if (reducedMorphTag.Length == 0)
            {
                // An empty reduced analysis is mapped to the dedicated placeholder
                reducedMorphTag = NoMorphAnalysis;
            }
            int morphId = morphIndex.AddToIndex(reducedMorphTag);
            // Score the factors and create the rule score p_W_T
            double p_W_Tf = Math.Log(ProbWordTag(word, loc, wordId, tagId));
            // Lemma factor is switched off; kept as an explicit zero term.
            double p_L_T = 0.0;
            double p_M_T = Math.Log(ProbMorphTag(tagId, morphId));
            double p_W_T = p_W_Tf + p_L_T + p_M_T;

            // Filter low probability taggings
            return(p_W_T > -100.0 ? (float)p_W_T : float.NegativeInfinity);
        }
        /// <summary>
        /// Updates the signature-based unknown word counts with one tagged word.
        /// </summary>
        /// <param name="tw">The tagged word to train on.</param>
        /// <param name="loc">Position of the word; passed to the signature function.</param>
        /// <param name="weight">Amount added to every counter touched by this word.</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }

            string surface   = tw.Word();
            string signature = model.GetSignature(surface, loc);
            string tagName   = tw.Tag();
            ILabel tagLabel  = new Tag(tagName);

            // Per-tag signature counts, created lazily for a tag not seen before
            if (!c.Contains(tagLabel))
            {
                c[tagLabel] = new ClassicCounter <string>();
            }
            c[tagLabel].IncrementCount(signature, weight);
            tc.IncrementCount(tagLabel, weight);
            seenEnd.Add(signature);

            IntTaggedWord wordOnly = new IntTaggedWord(surface, IntTaggedWord.Any, wordIndex, tagIndex);
            seenCounter.IncrementCount(wordOnly, weight);

            // Unknown-word statistics are collected only once some way through the trees
            // (treesRead is 1 based) and only while the word's seen count is below 2.
            if (treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordOnly) < 2)
            {
                IntTaggedWord tagOnly = new IntTaggedWord(IntTaggedWord.Any, tagName, wordIndex, tagIndex);
                unSeenCounter.IncrementCount(tagOnly, weight);
                unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
            }
        }
Esempio n. 3
0
        // if (itw.tag() == nullTag) {
        // sigs.add(itw);
        // }
        /// <summary>
        /// This records how likely it is for a word with one tag to also have another
        /// tag.
        /// </summary>
        /// <remarks>
        /// For every entry of <c>words</c> whose total seen count over all tags is at least 10,
        /// the per-tag relative frequency is accumulated into <c>m_T</c> and, for each tag the
        /// word was seen with, into the matching row of <c>m_TT</c>.
        /// This won't work after serialization/deserialization, but that is how
        /// it is currently called....
        /// </remarks>
        internal virtual void BuildPT_T()
        {
            int numTags = tagIndex.Size();

            // A jagged array only allocates the outer array; every row has to be created
            // explicitly, otherwise the accumulation below dereferences a null row.
            m_TT = new double[numTags][];
            for (int row = 0; row < numTags; row++)
            {
                m_TT[row] = new double[numTags];
            }
            m_T = new double[numTags];
            double[] tmp = new double[numTags];
            foreach (IntTaggedWord word in words)
            {
                // Collect the seen count of this word under every tag
                double tot = 0.0;
                for (int t = 0; t < numTags; t++)
                {
                    IntTaggedWord iTW = new IntTaggedWord(word.word, t);
                    tmp[t] = seenCounter.GetCount(iTW);
                    tot   += tmp[t];
                }
                // Words with a total count below 10 are skipped
                if (tot < 10)
                {
                    continue;
                }
                for (int t_1 = 0; t_1 < numTags; t_1++)
                {
                    for (int t2 = 0; t2 < numTags; t2++)
                    {
                        if (tmp[t2] > 0.0)
                        {
                            double c = tmp[t_1] / tot;
                            m_T[t_1]      += c;
                            m_TT[t2][t_1] += c;
                        }
                    }
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Adds one tagged word to the lexicon counts and to the unknown word model trainer.
        /// </summary>
        /// <param name="tw">The tagged word to train on.</param>
        /// <param name="loc">Position of the word; forwarded to the unknown word model trainer.</param>
        /// <param name="weight">Amount added to every counter touched by this word.</param>
        public virtual void Train(TaggedWord tw, int loc, double weight)
        {
            uwModelTrainer.Train(tw, loc, weight);
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);

            seenCounter.IncrementCount(iTW, weight);
            IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);

            seenCounter.IncrementCount(iT, weight);
            IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);

            seenCounter.IncrementCount(iW, weight);
            IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);

            seenCounter.IncrementCount(i, weight);
            // rules.add(iTW);
            tags.Add(iT);
            words.Add(iW);
            string            tag     = tw.Tag();
            string            baseTag = op.Langpack().BasicCategory(tag);
            ICounter <string> counts;

            // A dictionary indexer throws for a key that is not present, so the first
            // occurrence of a base tag must be detected with TryGetValue.
            // NOTE(review): assumes baseTagCounts is an IDictionary<string, ICounter<string>> - confirm.
            if (!baseTagCounts.TryGetValue(baseTag, out counts) || counts == null)
            {
                counts = new ClassicCounter <string>();
                baseTagCounts[baseTag] = counts;
            }
            counts.IncrementCount(tag, weight);
        }
Esempio n. 5
0
        /// <summary>Updates the unknown word counts with a single tagged word.</summary>
        /// <param name="tw">The tagged word to train on.</param>
        /// <param name="loc">Position of the word; passed to the signature lookup.</param>
        /// <param name="weight">Amount added to every counter touched by this word.</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            IntTaggedWord wordAndTag = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
            IntTaggedWord wordOnly   = new IntTaggedWord(wordAndTag.word, UnknownWordModelTrainerConstants.nullTag);

            seenCounter.IncrementCount(wordOnly, weight);

            // Counting starts only once some way through the trees (treesRead is 1 based),
            // and only for words whose seen count is still below 2.
            bool countAsUnknown = treesRead > indexToStartUnkCounting && seenCounter.GetCount(wordOnly) < 2;
            if (!countAsUnknown)
            {
                return;
            }

            // it's an entirely unknown word
            int signature = model.GetSignatureIndex(wordAndTag.word, loc, wordIndex.Get(wordAndTag.word));
            IntTaggedWord tagOnly         = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, wordAndTag.tag);
            IntTaggedWord signatureAndTag = new IntTaggedWord(signature, wordAndTag.tag);
            IntTaggedWord signatureOnly   = new IntTaggedWord(signature, UnknownWordModelTrainerConstants.nullTag);

            unSeenCounter.IncrementCount(signatureAndTag, weight);
            unSeenCounter.IncrementCount(tagOnly, weight);
            unSeenCounter.IncrementCount(signatureOnly, weight);
            unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
        }
 /// <summary>
 /// Creates a dependency from raw word and tag indices by wrapping each pair in an
 /// <c>IntTaggedWord</c> and delegating to the tagged-word constructor.
 /// </summary>
 /// <param name="headWord">Word index of the head.</param>
 /// <param name="headTag">Tag index of the head.</param>
 /// <param name="argWord">Word index of the argument.</param>
 /// <param name="argTag">Tag index of the argument.</param>
 /// <param name="leftHeaded">Direction flag stored on the dependency.</param>
 /// <param name="distance">Distance value; stored after a cast to <c>short</c>.</param>
 public IntDependency(int headWord, int headTag, int argWord, int argTag, bool leftHeaded, int distance)
     : this(new IntTaggedWord(headWord, headTag), new IntTaggedWord(argWord, argTag), leftHeaded, distance)
 {
 }
 /// <summary>Creates a dependency between two already constructed tagged words.</summary>
 /// <param name="head">The head of the dependency.</param>
 /// <param name="arg">The argument of the dependency.</param>
 /// <param name="leftHeaded">Direction flag stored on the dependency.</param>
 /// <param name="distance">Distance value; stored after a cast to <c>short</c>.</param>
 public IntDependency(IntTaggedWord head, IntTaggedWord arg, bool leftHeaded, int distance)
 {
     short storedDistance = (short)distance;

     this.leftHeaded = leftHeaded;
     this.distance   = storedDistance;
     this.head       = head;
     this.arg        = arg;
 }
        /// <summary>
        /// Scores an unknown word under a tag from the word's signature counts.
        /// </summary>
        /// <param name="iTW">Word/tag index pair to score.</param>
        /// <param name="loc">Position of the word; passed to the signature lookup.</param>
        /// <param name="c_Tseen">Seen count of the tag; divided by <paramref name="total"/>.</param>
        /// <param name="total">Total count used to normalise the tag and word terms.</param>
        /// <param name="smooth">Smoothing weight given to the tag-given-unknown estimate.</param>
        /// <param name="word">Surface form used for the signature lookup.</param>
        /// <returns>The natural log of the smoothed estimate, as a <c>float</c>.</returns>
        public override float Score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, string word)
        {
            // Raw counts for the unknown word model P(T|S)
            int    signature     = GetSignatureIndex(iTW.word, loc, word);
            double tagSigCount   = unSeenCounter.GetCount(new IntTaggedWord(signature, iTW.tag));
            double sigCount      = unSeenCounter.GetCount(new IntTaggedWord(signature, nullTag));
            double unknownCount  = unSeenCounter.GetCount(NullItw);
            double tagCount      = unSeenCounter.GetCount(new IntTaggedWord(nullWord, iTW.tag));
            double tagGivenUnknown = tagCount / unknownCount;

            // At unknown level 0 the signature counts are ignored entirely
            bool ignoreSignature = unknownLevel == 0;
            double numerator   = (ignoreSignature ? 0 : tagSigCount) + smooth * tagGivenUnknown;
            double denominator = (ignoreSignature ? 0 : sigCount) + smooth;
            double tagGivenSignature = numerator / denominator;

            double tagPrior  = (c_Tseen / total);
            double wordPrior = 1.0 / total;

            return (float)Math.Log(tagGivenSignature * wordPrior / tagPrior);
        }
        // end score()
        /// <summary>Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown)</summary>
        /// <param name="iTW">Word/tag index pair whose tag is being estimated.</param>
        /// <param name="loc">Position of the word; passed to the signature lookup.</param>
        /// <param name="smooth">Smoothing weight given to the tag-given-unknown estimate.</param>
        /// <param name="word">Surface form used for the signature lookup.</param>
        /// <returns>The smoothed estimate (not in log space).</returns>
        public override double ScoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, string word)
        {
            // Raw counts for the unknown word model P(T|S)
            int    signature     = GetSignatureIndex(iTW.word, loc, word);
            double tagSigCount   = unSeenCounter.GetCount(new IntTaggedWord(signature, iTW.tag));
            double sigCount      = unSeenCounter.GetCount(new IntTaggedWord(signature, nullTag));
            double unknownCount  = unSeenCounter.GetCount(NullItw);
            double tagCount      = unSeenCounter.GetCount(new IntTaggedWord(nullWord, iTW.tag));
            double tagGivenUnknown = tagCount / unknownCount;

            // At unknown level 0 the signature counts are ignored entirely
            bool ignoreSignature = unknownLevel == 0;
            double numerator   = (ignoreSignature ? 0 : tagSigCount) + smooth * tagGivenUnknown;
            double denominator = (ignoreSignature ? 0 : sigCount) + smooth;

            return numerator / denominator;
        }
Esempio n. 10
0
 /// <summary>Tells whether the given tagged word carries the boundary tag.</summary>
 /// <param name="rTW">The tagged word to check.</param>
 /// <returns><c>true</c> when the tag equals the index of the boundary tag.</returns>
 public virtual bool RootTW(IntTaggedWord rTW)
 {
     int  boundaryTag = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
     bool isRoot      = rTW.tag == boundaryTag;

     return isRoot;
 }
        /// <summary>
        /// Scores a word under a tag as the sum of log probabilities of its characters,
        /// each taken from the backed-off distribution of its context, minus a length penalty.
        /// </summary>
        /// <param name="iTW">Word/tag index pair; only the tag is used.</param>
        /// <param name="loc">Not used by this method.</param>
        /// <param name="word">The word to score; must not be the boundary symbol.</param>
        /// <param name="featureSpec">Not used by this method.</param>
        /// <returns>The penalised log score, as a <c>float</c>.</returns>
        public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            string tag = tagIndex.Get(iTW.tag);

            System.Diagnostics.Debug.Assert(!word.Equals(LexiconConstants.Boundary));
            char[] chars = word.ToCharArray();
            IList <ISerializable> charList = new List <ISerializable>(chars.Length + ContextLength + 1);

            // this starts of storing Symbol's and then starts storing String's. Clean this up someday!
            // charList is constructed backward
            // END_WORD char[length-1] char[length-2] ... char[0] BEGIN_WORD BEGIN_WORD
            charList.Add(ChineseCharacterBasedLexicon.Symbol.EndWord);
            for (int i = chars.Length - 1; i >= 0; i--)
            {
                ChineseCharacterBasedLexicon.Symbol ch = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(chars[i]);
                if (knownChars.Contains(ch))
                {
                    charList.Add(ch);
                }
                else
                {
                    // Characters outside the known set are replaced by their unknown class
                    charList.Add(UnknownCharClass(ch));
                }
            }
            for (int i_1 = 0; i_1 < ContextLength; i_1++)
            {
                charList.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
            }
            double score = 0.0;

            // The loop bound is the length of the list built above; it is fixed here because
            // the loop only replaces elements and never changes the list length.
            int size = charList.Count;

            for (int i_2 = 0; i_2 < size - ContextLength; i_2++)
            {
                ChineseCharacterBasedLexicon.Symbol nextChar = (ChineseCharacterBasedLexicon.Symbol)charList[i_2];
                // The predicted character's slot is overwritten with the tag, so the window
                // handed to the distribution starts with the tag followed by the context.
                charList.Set(i_2, tag);
                double charScore = GetBackedOffDist(charList.SubList(i_2, i_2 + ContextLength + 1)).ProbabilityOf(nextChar);
                score += Math.Log(charScore);
            }
            switch (penaltyType)
            {
            case 0:
            {
                // no length penalty
                break;
            }

            case 1:
            {
                // penalty grows with length * (length + 1)
                score -= (chars.Length * (chars.Length + 1)) * (lengthPenalty / 2);
                break;
            }

            case 2:
            {
                // penalty grows linearly with (length - 1)
                score -= (chars.Length - 1) * lengthPenalty;
                break;
            }
            }
            return((float)score);
        }
        /// <summary>Rule table is lemmas!</summary>
        /// <remarks>
        /// Rebuilds <c>rulesWithWord</c> from the word/tag counts: known words get their observed
        /// taggings, the unknown-word slot gets one rule per open class tag, and the boundary
        /// word gets its single boundary tagging.
        /// </remarks>
        protected internal override void InitRulesWithWord()
        {
            // Register the synthetic symbols before the table is sized
            int unkWord        = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
            int boundaryWordId = wordIndex.AddToIndex(LexiconConstants.Boundary);
            int boundaryTagId  = tagIndex.AddToIndex(LexiconConstants.BoundaryTag);

            // One (initially empty) rule list per indexed word
            int tableSize = wordIndex.Size();
            rulesWithWord = new IList[tableSize];
            for (int slot = 0; slot < tableSize; slot++)
            {
                rulesWithWord[slot] = new List <IntTaggedWord>(1);
            }

            // Gather every observed (word, tag) pair plus a tag-only entry per observed tag
            ICollection <IntTaggedWord> lexRules = Generics.NewHashSet(40000);
            foreach (int wordId in wordTag.FirstKeySet())
            {
                foreach (int tagId in wordTag.GetCounter(wordId).KeySet())
                {
                    lexRules.Add(new IntTaggedWord(wordId, tagId));
                    lexRules.Add(new IntTaggedWord(nullWord, tagId));
                }
            }

            // Known words and signatures
            foreach (IntTaggedWord rule in lexRules)
            {
                if (rule.Word() != nullWord)
                {
                    // Known word
                    rulesWithWord[rule.word].Add(rule);
                    continue;
                }

                // Mix in UW signature rules for open class types
                double types = uwModel.UnSeenCounter().GetCount(rule);
                if (types > trainOptions.openClassTypesThreshold)
                {
                    IntTaggedWord unknownRule = new IntTaggedWord(unkWord, rule.tag);
                    bool alreadyPresent = rulesWithWord[unkWord].Contains(unknownRule);
                    if (!alreadyPresent)
                    {
                        rulesWithWord[unkWord].Add(unknownRule);
                    }
                }
            }

            log.Info("The " + rulesWithWord[unkWord].Count + " open class tags are: [");
            foreach (IntTaggedWord openClassRule in rulesWithWord[unkWord])
            {
                log.Info(" " + tagIndex.Get(openClassRule.Tag()));
            }
            log.Info(" ] ");

            // Boundary symbol has one tagging
            IntTaggedWord boundaryRule = new IntTaggedWord(boundaryWordId, boundaryTagId);
            rulesWithWord[boundaryWordId].Add(boundaryRule);
        }
Esempio n. 13
0
        /// <summary>
        /// Rebuilds <c>rulesWithWord</c> and <c>tags</c> from the seen counter: the unknown-word
        /// slot receives one rule per open class tag, and every seen word/tag pair is filed
        /// under its word.
        /// </summary>
        protected internal virtual void InitRulesWithWord()
        {
            bool verbose = testOptions.verbose || DebugLexicon;
            if (verbose)
            {
                log.Info("Initializing lexicon scores ... ");
            }

            int unkWord   = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
            int tableSize = wordIndex.Size();

            // One rule list per indexed word; most end up with 1 or 2 items
            rulesWithWord = new IList[tableSize];
            for (int slot = 0; slot < tableSize; slot++)
            {
                rulesWithWord[slot] = new List <IntTaggedWord>(1);
            }

            // Tag-only entries of the seen counter (null word, real tag)
            tags = Generics.NewHashSet();
            foreach (IntTaggedWord seen in seenCounter.KeySet())
            {
                bool isTagOnly = seen.Word() == nullWord && seen.Tag() != nullTag;
                if (isTagOnly)
                {
                    tags.Add(seen);
                }
            }

            // tags for unknown words
            foreach (IntTaggedWord tagOnly in tags)
            {
                // Number of types before it's treated as open class
                double types = uwModel.UnSeenCounter().GetCount(tagOnly);
                if (types > trainOptions.openClassTypesThreshold)
                {
                    rulesWithWord[unkWord].Add(new IntTaggedWord(unkWord, tagOnly.tag));
                }
            }

            if (verbose)
            {
                StringBuilder message = new StringBuilder();
                message.Append("The ");
                message.Append(rulesWithWord[unkWord].Count);
                message.Append(" open class tags are: [");
                foreach (IntTaggedWord openClassRule in rulesWithWord[unkWord])
                {
                    message.Append(' ');
                    message.Append(tagIndex.Get(openClassRule.Tag()));
                }
                message.Append(" ]");
                log.Info(message.ToString());
            }

            // Every fully specified (word, tag) pair becomes a rule of its word
            foreach (IntTaggedWord seen in seenCounter.KeySet())
            {
                if (seen.Tag() == nullTag || seen.Word() == nullWord)
                {
                    continue;
                }
                rulesWithWord[seen.word].Add(seen);
            }
        }
Esempio n. 14
0
        /// <summary>Checks whether a word is in the lexicon.</summary>
        /// <remarks>
        /// This version works even while compiling the lexicon with current counters
        /// (rather than using the compiled rulesWithWord array): a word counts as known
        /// when it is contained in the word index and its word-only seen count is positive.
        /// TODO: The previous version would insert rules into the
        /// wordNumberer.  Is that the desired behavior?
        /// </remarks>
        /// <param name="word">The word as a String</param>
        /// <returns>Whether the word is in the lexicon</returns>
        public virtual bool IsKnown(string word)
        {
            // The index is consulted first so that words it does not contain are never looked up
            bool indexed = wordIndex.Contains(word);

            return indexed
                   && seenCounter.GetCount(new IntTaggedWord(wordIndex.IndexOf(word), nullTag)) > 0.0;
        }
Esempio n. 15
0
 /// <summary>Adds the tagging with count to the data structures in this Lexicon.</summary>
 /// <param name="seen">Must be <c>false</c>; a seen tagging is only reported to the log and not counted.</param>
 /// <param name="itw">The tagging to count as unseen.</param>
 /// <param name="count">Amount added to the unseen counter.</param>
 public virtual void AddTagging(bool seen, IntTaggedWord itw, double count)
 {
     if (!seen)
     {
         unSeenCounter.IncrementCount(itw, count);
         return;
     }

     // Seen taggings do not belong in this model; just report the misuse
     log.Info("UWM.addTagging: Shouldn't call with seen word!");
 }
Esempio n. 16
0
        /// <summary>
        /// Converts tagged words into their index-based form using the word and tag indices.
        /// </summary>
        /// <param name="taggedWords">The tagged words to convert.</param>
        /// <returns>A new list with one <c>IntTaggedWord</c> per input element, in input order.</returns>
        protected internal virtual IList <IntTaggedWord> ListToEvents(IList <TaggedWord> taggedWords)
        {
            IList <IntTaggedWord> events = new List <IntTaggedWord>();

            foreach (TaggedWord tagged in taggedWords)
            {
                events.Add(new IntTaggedWord(tagged.Word(), tagged.Tag(), wordIndex, tagIndex));
            }

            return events;
        }
 /// <summary>Tells whether the given tagged word carries one of the punctuation tags.</summary>
 /// <param name="argTW">The tagged word to check.</param>
 /// <returns><c>true</c> when the tag matches the index of any punctuation tag of the language pack.</returns>
 public virtual bool PruneTW(IntTaggedWord argTW)
 {
     string[] punctTags = tlp.PunctuationTags();

     for (int k = 0; k < punctTags.Length; k++)
     {
         int punctTagIndex = tagIndex.IndexOf(punctTags[k]);
         if (argTW.tag == punctTagIndex)
         {
             return true;
         }
     }

     return false;
 }
        /// <summary>
        /// Scores an unknown word under a tag from the smoothed tag-given-signature estimate.
        /// </summary>
        /// <param name="iTW">Word/tag index pair to score.</param>
        /// <param name="loc">Position of the word; forwarded to the signature-based estimate.</param>
        /// <param name="c_Tseen">Seen count of the tag; divided by <paramref name="total"/>.</param>
        /// <param name="total">Total count used to normalise the tag and word terms.</param>
        /// <param name="smooth">Smoothing weight forwarded to the signature-based estimate.</param>
        /// <param name="word">Surface form forwarded to the signature-based estimate.</param>
        /// <returns>The log score, or negative infinity when it is not greater than -100.</returns>
        public override float Score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, string word)
        {
            double tagGivenSignature = ScoreProbTagGivenWordSignature(iTW, loc, smooth, word);
            double tagPrior          = (c_Tseen / total);
            double wordPrior         = 1.0 / total;
            double logScore          = Math.Log(tagGivenSignature * wordPrior / tagPrior);

            // Filter low probability taggings
            return logScore > -100.0 ? (float)logScore : float.NegativeInfinity;
        }
Esempio n. 19
0
 /// <summary>
 /// Stores the configuration and indices, creates the stop and wildcard tagged words,
 /// and then initialises the tag bins.
 /// </summary>
 /// <param name="tlp">Language pack stored on the grammar.</param>
 /// <param name="tagProjection">Tag projection stored on the grammar.</param>
 /// <param name="directional">Stored as the <c>directional</c> setting.</param>
 /// <param name="useDistance">Stored as the <c>useDistance</c> setting.</param>
 /// <param name="useCoarseDistance">Stored as the <c>useCoarseDistance</c> setting.</param>
 /// <param name="op">Options stored on the grammar.</param>
 /// <param name="wordIndex">Index of words stored on the grammar.</param>
 /// <param name="tagIndex">Index of tags stored on the grammar.</param>
 public AbstractDependencyGrammar(ITreebankLanguagePack tlp, ITagProjection tagProjection, bool directional, bool useDistance, bool useCoarseDistance, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     // Indices and options
     this.wordIndex = wordIndex;
     this.tagIndex  = tagIndex;
     this.op        = op;

     // Language-specific collaborators
     this.tlp           = tlp;
     this.tagProjection = tagProjection;

     // Model switches
     this.directional       = directional;
     this.useDistance       = useDistance;
     this.useCoarseDistance = useCoarseDistance;

     // Special tagged words
     stopTW = new IntTaggedWord(IntTaggedWord.StopWordInt, IntTaggedWord.StopTagInt);
     wildTW = new IntTaggedWord(IntTaggedWord.AnyWordInt, IntTaggedWord.AnyTagInt);

     // Runs last, after every field above has been assigned
     InitTagBins();
 }
Esempio n. 20
0
        /// <summary>Generate the possible taggings for a word at a sentence position.</summary>
        /// <remarks>
        /// This may either be based on a strict lexicon or an expanded generous
        /// set of possible taggings. <p>
        /// <i>Implementation note:</i> Expanded sets of possible taggings are
        /// calculated dynamically at runtime, so as to reduce the memory used by
        /// the lexicon (a space/time tradeoff).
        /// </remarks>
        /// <param name="word">The word (as an int)</param>
        /// <param name="loc">Its index in the sentence (usually only relevant for unknown words)</param>
        /// <param name="featureSpec">Not used by this method.</param>
        /// <returns>An enumerator over the possible taggings</returns>
        public virtual IEnumerator <IntTaggedWord> RuleIteratorByWord(int word, int loc, string featureSpec)
        {
            if (!IsKnown(word))
            {
                // Unknown word: copy the unknown-word rules so the correct word is in each item
                IList <IntTaggedWord> unknownTaggings = new List <IntTaggedWord>(40);
                foreach (IntTaggedWord unknownRule in rulesWithWord[wordIndex.IndexOf(LexiconConstants.UnknownWord)])
                {
                    unknownTaggings.Add(new IntTaggedWord(word, unknownRule.tag));
                }
                return unknownTaggings.GetEnumerator();
            }

            if (!flexiTag)
            {
                // Strict lexical tagging for seen items
                IList <IntTaggedWord> strictTaggings = rulesWithWord[word];
                return strictTaggings.GetEnumerator();
            }

            // Flexible tagging: allow all scored taggings, unless the word is very common
            IntTaggedWord wordOnly = new IntTaggedWord(word, nullTag);
            if (seenCounter.GetCount(wordOnly) > smoothInUnknownsThreshold)
            {
                return rulesWithWord[word].GetEnumerator();
            }

            // give it flexible tagging not just lexicon
            IList <IntTaggedWord> flexibleTaggings = new List <IntTaggedWord>(40);
            foreach (IntTaggedWord tagOnly in tags)
            {
                IntTaggedWord candidate = new IntTaggedWord(word, tagOnly.tag);
                float candidateScore = Score(candidate, loc, wordIndex.Get(word), null);
                if (candidateScore > float.NegativeInfinity)
                {
                    flexibleTaggings.Add(candidate);
                }
            }
            return flexibleTaggings.GetEnumerator();
        }
Esempio n. 21
0
        /// <summary>
        /// Returns the log probability of the tag for the word, provided it lies within the
        /// cutoff of the best log probability.
        /// </summary>
        /// <param name="iTW">Word/tag index pair to score.</param>
        /// <param name="loc">Not used by this method.</param>
        /// <param name="word">Not used by this method.</param>
        /// <param name="featureSpec">Not used by this method.</param>
        /// <returns>The tag's log probability, or negative infinity when it is at or below the cutoff.</returns>
        public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            EnsureProbs(iTW.Word());

            double best     = Counters.Max(logProbs);
            double tagScore = logProbs.GetCount(iTW.TagString(tagIndex));
            bool   withinCutoff = tagScore > best - iteratorCutoffFactor;

            return withinCutoff ? (float)tagScore : float.NegativeInfinity;
        }
Esempio n. 22
0
        /// <summary>
        /// This is a custom interner that simultaneously creates and interns
        /// an IntDependency.
        /// </summary>
        /// <remarks>
        /// The head and argument are interned first. When <c>expandDependencyMap</c> is set, the
        /// dependency already stored there is returned, or the new one is stored and returned;
        /// when it is <c>null</c>, the new dependency is returned as is.
        /// </remarks>
        /// <param name="headTW">Head of the dependency.</param>
        /// <param name="argTW">Argument of the dependency.</param>
        /// <param name="leftHeaded">Direction flag of the dependency.</param>
        /// <param name="dist">Distance value of the dependency.</param>
        /// <returns>An interned IntDependency</returns>
        protected internal virtual IntDependency Intern(IntTaggedWord headTW, IntTaggedWord argTW, bool leftHeaded, short dist)
        {
            IDictionary <IntDependency, IntDependency> map = expandDependencyMap;
            IntDependency internTempDependency             = new IntDependency(itwInterner.Intern(headTW), itwInterner.Intern(argTW), leftHeaded, dist);

            if (map == null)
            {
                return(internTempDependency);
            }

            // The dictionary indexer throws for a key that is not present, so the lookup has
            // to go through TryGetValue for the "not interned yet" case to be reachable.
            IntDependency returnDependency;
            if (!map.TryGetValue(internTempDependency, out returnDependency) || returnDependency == null)
            {
                map[internTempDependency] = internTempDependency;
                returnDependency          = internTempDependency;
            }
            return(returnDependency);
        }
Esempio n. 23
0
        /// <summary>
        /// Enumerates the taggings of a word whose log probability lies within the cutoff of
        /// the best log probability.
        /// </summary>
        /// <param name="word">The word (as an int).</param>
        /// <param name="loc">Not used by this method.</param>
        /// <param name="featureSpec">Not used by this method.</param>
        /// <returns>An enumerator over the retained taggings, in tag index order.</returns>
        public virtual IEnumerator <IntTaggedWord> RuleIteratorByWord(int word, int loc, string featureSpec)
        {
            EnsureProbs(word);

            double best = Counters.Max(logProbs);
            IList <IntTaggedWord> retained = new List <IntTaggedWord>();

            int tagIdx = 0;
            while (tagIdx < tagIndex.Size())
            {
                double tagScore = logProbs.GetCount(tagIndex.Get(tagIdx));
                if (tagScore > best - iteratorCutoffFactor)
                {
                    retained.Add(new IntTaggedWord(word, tagIdx));
                }
                tagIdx++;
            }

            return retained.GetEnumerator();
        }
        /// <summary>Trains the first-character based unknown word model.</summary>
        /// <remarks>
        /// Each word contributes its first character (or, with <c>useUnicodeType</c>, a string
        /// form of that character's type) to the per-tag counts, and is added to the seen and
        /// unseen counters.
        /// </remarks>
        /// <param name="tw">The word we are currently training on</param>
        /// <param name="loc">The position of that word</param>
        /// <param name="weight">The weight to give this word in terms of training</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            string word  = tw.Word();
            ILabel tagL  = new Tag(tw.Tag());
            // NOTE(review): both this substring and word[0] below presumably require a
            // non-empty word - confirm that callers never pass an empty string.
            string first = Sharpen.Runtime.Substring(word, 0, 1);

            if (useUnicodeType)
            {
                char ch   = word[0];
                // NOTE(review): char.GetType(ch), char.OtherLetter and int.ToString(type) look like
                // untranslated Java calls (Character.getType / OTHER_LETTER / Integer.toString);
                // confirm the intended .NET equivalents before relying on this branch.
                int  type = char.GetType(ch);
                if (type != char.OtherLetter)
                {
                    // standard Chinese characters are of type "OTHER_LETTER"!!
                    first = int.ToString(type);
                }
            }
            string tag = tw.Tag();

            // Per-tag first-character counts, created lazily for a tag not seen before
            if (!c.Contains(tagL))
            {
                c[tagL] = new ClassicCounter <string>();
            }
            c[tagL].IncrementCount(first, weight);
            tc.IncrementCount(tagL, weight);
            seenFirst.Add(first);
            // Word-only entry (any tag) used to track how often the word has been seen
            IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    // Tag-only entry (any word) plus the overall total
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iTotal, weight);
                }
            }
        }
        /// <summary>
        /// Populates data in this DependencyGrammar from the character stream
        /// given by the reader <paramref name="in"/>.
        /// </summary>
        /// <remarks>
        /// All lines have one rule per line. Reading stops at the end of the stream or at the
        /// first empty line. Lines before the <c>BEGIN_STOP</c> marker are added as argument
        /// dependencies, lines after it as stop dependencies.
        /// </remarks>
        /// <param name="in">Reader positioned at the first rule line.</param>
        /// <exception cref="System.IO.IOException">
        /// Any failure while handling a line is rethrown as an IOException naming that line.
        /// </exception>
        public override void ReadData(BufferedReader @in)
        {
            const string LeftMarker = "left";
            bool doingStop = false;

            // The line counter advances in the loop header so that it also advances when the
            // BEGIN_STOP marker line is skipped with "continue"; otherwise every error reported
            // after the marker would name a line number that is one too low.
            int lineNum = 1;

            for (string line = @in.ReadLine(); line != null && line.Length > 0; line = @in.ReadLine(), lineNum++)
            {
                try
                {
                    if (line.Equals("BEGIN_STOP"))
                    {
                        doingStop = true;
                        continue;
                    }
                    // split on spaces, quote with doublequote, and escape with backslash
                    string[] fields = StringUtils.SplitOnCharWithQuoting(line, ' ', '\"', '\\');
                    // Field layout used below: 0 = head, 2 = argument, 3 = direction, 4 = distance, 5 = count
                    short         distance       = (short)System.Convert.ToInt32(fields[4]);
                    IntTaggedWord tempHead       = new IntTaggedWord(fields[0], '/', wordIndex, tagIndex);
                    IntTaggedWord tempArg        = new IntTaggedWord(fields[2], '/', wordIndex, tagIndex);
                    IntDependency tempDependency = new IntDependency(tempHead, tempArg, fields[3].Equals(LeftMarker), distance);
                    // NOTE(review): this parse uses the current culture; confirm it matches the
                    // way the counts are written before switching to the invariant culture.
                    double        count          = double.Parse(fields[5]);
                    if (doingStop)
                    {
                        ExpandStop(tempDependency, distance, count, false);
                    }
                    else
                    {
                        ExpandArg(tempDependency, distance, count);
                    }
                }
                catch (Exception e)
                {
                    IOException ioe = new IOException("Error on line " + lineNum + ": " + line);
                    ioe.InitCause(e);
                    throw ioe;
                }
            }
        }
        /// <summary>
        /// Adds the count of one dependency to the stop counter, for the binned head as well
        /// as for the tag-only head.
        /// </summary>
        /// <param name="dependency">Dependency whose head, argument and direction are used.</param>
        /// <param name="distBinDist">Distance value used for the interned dependencies.</param>
        /// <param name="count">Amount added to each stop counter entry.</param>
        /// <param name="wildForStop">When <c>true</c>, the wildcard entries are updated even for a stop argument.</param>
        private void ExpandStop(IntDependency dependency, short distBinDist, double count, bool wildForStop)
        {
            // Tag-only head first, then head and argument with their tags mapped to tag bins
            IntTaggedWord tagOnlyHead = GetCachedITW(dependency.head.tag);
            IntTaggedWord binnedHead  = new IntTaggedWord(dependency.head.word, TagBin(dependency.head.tag));
            IntTaggedWord binnedArg   = new IntTaggedWord(dependency.arg.word, TagBin(dependency.arg.tag));
            bool leftHeaded = dependency.leftHeaded;
            bool argIsStop  = binnedArg.word == IntTaggedWord.StopWordInt;

            if (argIsStop)
            {
                stopCounter.IncrementCount(Intern(binnedHead, binnedArg, leftHeaded, distBinDist), count);
                stopCounter.IncrementCount(Intern(tagOnlyHead, binnedArg, leftHeaded, distBinDist), count);
            }

            if (wildForStop || !argIsStop)
            {
                stopCounter.IncrementCount(Intern(binnedHead, wildTW, leftHeaded, distBinDist), count);
                stopCounter.IncrementCount(Intern(tagOnlyHead, wildTW, leftHeaded, distBinDist), count);
            }
        }
        // log.info("stopCounter: " + stopCounter);
        // log.info("argCounter: " + argCounter);
        //new ArrayList();
        /// <summary>
        /// Maps a tag to a cached IntTaggedWord that represents the tag by
        /// pairing the wildcard word IntTaggedWord.AnyWordInt with the tag
        /// in the reduced tag space.
        /// </summary>
        /// <remarks>
        /// The argument is in terms of the full tag space; internally this
        /// function maps to the reduced space.
        /// </remarks>
        /// <param name="tag">short representation of tag in full tag space</param>
        /// <returns>an IntTaggedWord in the reduced tag space</returns>
        private IntTaggedWord GetCachedITW(short tag)
        {
            // Cache slots are shifted by 2 because -1 and -2 are used with
            // special meanings (see IntTaggedWord).
            if (tagITWList == null)
            {
                int slotCount = numTagBins + 2;
                tagITWList = new List <IntTaggedWord>(slotCount);
                for (int i = 0; i < slotCount; i++)
                {
                    // Pre-fill with nulls so every slot can be addressed by index.
                    // List<T> has no Java-style Add(index, item); appending is
                    // equivalent here because each insert is at the end.
                    tagITWList.Add(null);
                }
            }

            // Map the full-space tag to its bin once and reuse it below.
            var binnedTag = TagBin(tag);
            int slot      = binnedTag + 2;

            IntTaggedWord headT = tagITWList[slot];

            if (headT == null)
            {
                // First request for this bin: build the wildcard-word entry and cache it.
                headT = new IntTaggedWord(IntTaggedWord.AnyWordInt, binnedTag);
                tagITWList[slot] = headT;
            }
            return(headT);
        }
Esempio n. 28
0
 /// <summary>
 /// Finishes training and returns the unknown word model.
 /// </summary>
 /// <remarks>
 /// If the unseen-word counter is empty, a warning is written to standard
 /// error and the counter is filled with one count per tag other than the
 /// boundary tag (plus a matching count on the null tagged word each time),
 /// so that the unseen distribution is uniform over tags instead of empty.
 /// </remarks>
 /// <returns>the <c>model</c> field of this trainer</returns>
 public override IUnknownWordModel FinishTraining()
 {
     // make sure the unseen counter isn't empty!  If it is, put in
     // a uniform unseen over tags
     if (unSeenCounter.IsEmpty())
     {
         // Use .NET composite formatting: TextWriter has no Printf and "%s" is
         // not a .NET placeholder. Write (not WriteLine) keeps the message
         // without a trailing newline.
         System.Console.Error.Write("{0}: WARNING: Unseen word counter is empty!", this.GetType().FullName);

         IntTaggedWord nullItw = UnknownWordModelTrainerConstants.NullItw;
         int           numTags = tagIndex.Size();
         for (int tt = 0; tt < numTags; tt++)
         {
             if (BoundaryTag.Equals(tagIndex.Get(tt)))
             {
                 // The boundary tag gets no unseen mass.
                 continue;
             }
             IntTaggedWord iT = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, tt);
             unSeenCounter.IncrementCount(iT);
             unSeenCounter.IncrementCount(nullItw);
         }
     }
     return(model);
 }
Esempio n. 29
0
        // if (useMaxentUnknownWordModel) {
        //  cml = new ChineseMaxentLexicon();
        // } else {
        //unknown = new ChineseUnknownWordModel();
        //this.setUnknownWordModel(new ChineseUnknownWordModel(op));
        // this.getUnknownWordModel().setLexicon(this);
        // }
        /// <summary>
        /// Scores a tagged word: pairs with a positive count in
        /// <c>seenCounter</c> are scored by the base lexicon, all others by
        /// the unknown word model.
        /// </summary>
        /// <param name="iTW">the word/tag pair to score</param>
        /// <param name="loc">position argument forwarded to the chosen scorer</param>
        /// <param name="word">the word string forwarded to the chosen scorer</param>
        /// <param name="featureSpec">feature specification, forwarded only to the base lexicon</param>
        /// <returns>the score produced by whichever scorer was used</returns>
        public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            double seenCount = seenCounter.GetCount(iTW);

            if (seenCount > 0.0)
            {
                return(base.Score(iTW, loc, word, featureSpec));
            }

            // Not seen in training: defer to the unknown word model.
            // ChineseUnknownWordModel doesn't use the three 0.0 params.
            return(this.GetUnknownWordModel().Score(iTW, loc, 0.0, 0.0, 0.0, word));
        }
        /// <summary>
        /// Return the probability (as a real number between 0 and 1) of stopping
        /// rather than generating another argument at this position.
        /// </summary>
        /// <param name="dependency">
        /// The dependency used as the basis for stopping on.
        /// Tags are assumed to be in the TagProjection space.
        /// </param>
        /// <returns>The smoothed probability of stopping for this dependency</returns>
        protected internal virtual double GetStopProb(IntDependency dependency)
        {
            short distBin    = DistanceBin(dependency.distance);
            bool  headOnLeft = dependency.leftHeaded;

            // Tag-only views of the head, used for the backed-off estimate.
            IntTaggedWord tagHeadUnknownWord = new IntTaggedWord(-1, dependency.head.tag);
            IntTaggedWord tagHeadAnyWord     = new IntTaggedWord(IntTaggedWord.AnyWordInt, dependency.head.tag);

            // Counts keyed on the full head (word + tag).
            double stopCountWordHead = stopCounter.GetCount(
                new IntDependency(dependency.head, stopTW, headOnLeft, distBin));

            // Stop count keyed on the tag-only head.
            double stopCountTagHead = stopCounter.GetCount(
                new IntDependency(tagHeadUnknownWord, stopTW, headOnLeft, distBin));

            // Totals (wildcard argument) for the full head and the tag-only head.
            double totalWordHead = stopCounter.GetCount(
                new IntDependency(dependency.head, wildTW, headOnLeft, distBin));
            double totalTagHead = stopCounter.GetCount(
                new IntDependency(tagHeadAnyWord, wildTW, headOnLeft, distBin));

            // Backed-off estimate from the tag alone; defaults to 1.0 when the tag has no counts.
            double tagStopProb;
            if (totalTagHead > 0.0)
            {
                tagStopProb = stopCountTagHead / totalTagHead;
            }
            else
            {
                tagStopProb = 1.0;
            }

            // Smooth the word-level estimate towards the tag-level one, weighted by smooth_stop.
            return((stopCountWordHead + smooth_stop * tagStopProb) / (totalWordHead + smooth_stop));
        }