/// <summary>
/// Scores a tagging as a sum of log factors: log P(W|T) + log P(L|T) + log P(M|T),
/// where W is the word, L its lemma, and M its reduced morphological analysis.
/// The lemma factor is currently disabled (contributes 0).
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence</param>
/// <param name="word">The word as a string (may carry a morph annotation)</param>
/// <param name="featureSpec">Morphological feature specification string</param>
/// <returns>The log probability, or negative infinity for very low-probability taggings</returns>
public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    int wordId = iTW.Word();
    int tagId = iTW.Tag();
    // Force 1-best path to go through the boundary symbol
    // (deterministic tagging)
    int boundaryId = wordIndex.IndexOf(LexiconConstants.Boundary);
    int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
    if (wordId == boundaryId && tagId == boundaryTagId)
    {
        return(0.0f);
    }
    // Morphological features: split the input into lemma and rich morphological tag.
    // NOTE(review): 'tag' and 'lemmaId' are computed but unused below (kept for the
    // commented-out lemma factor) — confirm before removing.
    string tag = tagIndex.Get(iTW.Tag());
    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
    string lemma = lemmaMorph.First();
    int lemmaId = wordIndex.IndexOf(lemma);
    string richMorphTag = lemmaMorph.Second();
    // Reduce the rich morph tag to the active feature subset; empty reductions get a sentinel.
    string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
    reducedMorphTag = reducedMorphTag.Length == 0 ? NoMorphAnalysis : reducedMorphTag;
    int morphId = morphIndex.AddToIndex(reducedMorphTag);
    // Score the factors and create the rule score p_W_T
    double p_W_Tf = Math.Log(ProbWordTag(word, loc, wordId, tagId));
    // double p_L_T = Math.log(probLemmaTag(word, loc, tagId, lemmaId));
    double p_L_T = 0.0;
    double p_M_T = Math.Log(ProbMorphTag(tagId, morphId));
    double p_W_T = p_W_Tf + p_L_T + p_M_T;
    // String tag = tagIndex.get(tagId);
    // Filter low probability taggings
    return(p_W_T > -100.0 ? (float)p_W_T : float.NegativeInfinity);
}
/// <summary>
/// Trains the signature-based unknown word model on one tagged word:
/// counts the word's signature under its tag, and — once enough trees have
/// been read — counts rarely-seen words as unknowns.
/// </summary>
/// <param name="tw">The tagged word being trained on</param>
/// <param name="loc">The word's position in the sentence</param>
/// <param name="weight">Training weight for this observation</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    if (useGT)
    {
        // Also feed the Good-Turing trainer when enabled.
        unknownGTTrainer.Train(tw, weight);
    }
    // scan data
    string word = tw.Word();
    string subString = model.GetSignature(word, loc);
    ILabel tag = new Tag(tw.Tag());
    if (!c.Contains(tag))
    {
        c[tag] = new ClassicCounter<string>();
    }
    // Count signature given tag, plus the tag marginal.
    c[tag].IncrementCount(subString, weight);
    tc.IncrementCount(tag, weight);
    seenEnd.Add(subString);
    string tagStr = tw.Tag();
    IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(iW, weight);
    if (treesRead > indexToStartUnkCounting)
    {
        // start doing this once some way through trees;
        // treesRead is 1 based counting
        if (seenCounter.GetCount(iW) < 2)
        {
            // Word seen fewer than twice so far: treat it as unknown for this tag.
            IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex);
            unSeenCounter.IncrementCount(iT, weight);
            unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
        }
    }
}
/// <summary>
/// This records how likely it is for a word with one tag to also have another
/// tag.
/// </summary>
/// <remarks>
/// This records how likely it is for a word with one tag to also have another
/// tag. This won't work after serialization/deserialization, but that is how
/// it is currently called....
/// </remarks>
internal virtual void BuildPT_T()
{
    int numTags = tagIndex.Size();
    // BUG FIX: a C# jagged array ("new double[numTags][]") leaves every inner row
    // null (unlike Java's "new double[n][n]"), so the "m_TT[t2][t_1] += c" below
    // would throw NullReferenceException. Allocate each row explicitly.
    m_TT = new double[numTags][];
    for (int row = 0; row < numTags; row++)
    {
        m_TT[row] = new double[numTags];
    }
    m_T = new double[numTags];
    double[] tmp = new double[numTags];
    foreach (IntTaggedWord word in words)
    {
        // Gather this word's count under every tag.
        double tot = 0.0;
        for (int t = 0; t < numTags; t++)
        {
            IntTaggedWord iTW = new IntTaggedWord(word.word, t);
            tmp[t] = seenCounter.GetCount(iTW);
            tot += tmp[t];
        }
        if (tot < 10)
        {
            // Skip low-frequency words: their tag distribution is unreliable.
            continue;
        }
        // Accumulate co-tagging mass: for each tag pair (t2 observed, t_1 any),
        // add t_1's share of this word's mass.
        for (int t_1 = 0; t_1 < numTags; t_1++)
        {
            for (int t2 = 0; t2 < numTags; t2++)
            {
                if (tmp[t2] > 0.0)
                {
                    double c = tmp[t_1] / tot;
                    m_T[t_1] += c;
                    m_TT[t2][t_1] += c;
                }
            }
        }
    }
}
/// <summary>
/// Records one observed (word, tag) pair: updates the joint count, both
/// marginals, the grand total, and the per-base-category tag counts.
/// </summary>
/// <param name="tw">The tagged word being trained on</param>
/// <param name="loc">The word's position in the sentence</param>
/// <param name="weight">Training weight for this observation</param>
public virtual void Train(TaggedWord tw, int loc, double weight)
{
    // Let the unknown-word model trainer see every observation first.
    uwModelTrainer.Train(tw, loc, weight);
    IntTaggedWord wordAndTag = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
    IntTaggedWord tagOnly = new IntTaggedWord(nullWord, wordAndTag.tag);
    IntTaggedWord wordOnly = new IntTaggedWord(wordAndTag.word, nullTag);
    IntTaggedWord grandTotal = new IntTaggedWord(nullWord, nullTag);
    // Joint count, both marginals, and the overall total.
    seenCounter.IncrementCount(wordAndTag, weight);
    seenCounter.IncrementCount(tagOnly, weight);
    seenCounter.IncrementCount(wordOnly, weight);
    seenCounter.IncrementCount(grandTotal, weight);
    // rules.add(iTW);
    tags.Add(tagOnly);
    words.Add(wordOnly);
    // Track how often each fine-grained tag occurs under its base category.
    string fineTag = tw.Tag();
    string baseTag = op.Langpack().BasicCategory(fineTag);
    ICounter<string> counts = baseTagCounts[baseTag];
    if (counts == null)
    {
        counts = new ClassicCounter<string>();
        baseTagCounts[baseTag] = counts;
    }
    counts.IncrementCount(fineTag, weight);
}
/// <summary>Trains this lexicon on the Collection of trees.</summary>
/// <remarks>
/// Counts the word marginal for every observation; once enough trees have been
/// read, words seen fewer than twice are recorded as unknowns under their
/// signature and tag.
/// </remarks>
/// <param name="tw">The tagged word being trained on</param>
/// <param name="loc">The word's position in the sentence</param>
/// <param name="weight">Training weight for this observation</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
    IntTaggedWord iT = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, iTW.tag);
    IntTaggedWord iW = new IntTaggedWord(iTW.word, UnknownWordModelTrainerConstants.nullTag);
    // Only the word marginal is counted unconditionally.
    seenCounter.IncrementCount(iW, weight);
    IntTaggedWord i = UnknownWordModelTrainerConstants.NullItw;
    if (treesRead > indexToStartUnkCounting)
    {
        // start doing this once some way through trees;
        // treesRead is 1 based counting
        if (seenCounter.GetCount(iW) < 2)
        {
            // it's an entirely unknown word: count its signature under this tag,
            // plus signature/tag/total marginals in the unseen counter.
            int s = model.GetSignatureIndex(iTW.word, loc, wordIndex.Get(iTW.word));
            IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
            IntTaggedWord iS = new IntTaggedWord(s, UnknownWordModelTrainerConstants.nullTag);
            unSeenCounter.IncrementCount(iTS, weight);
            unSeenCounter.IncrementCount(iT, weight);
            unSeenCounter.IncrementCount(iS, weight);
            unSeenCounter.IncrementCount(i, weight);
        }
    }
}
/// <summary>
/// Builds a dependency from raw word/tag indices, wrapping each pair in an
/// IntTaggedWord before storing it.
/// </summary>
public IntDependency(int headWord, int headTag, int argWord, int argTag, bool leftHeaded, int distance)
{
    IntTaggedWord headTW = new IntTaggedWord(headWord, headTag);
    IntTaggedWord argTW = new IntTaggedWord(argWord, argTag);
    this.leftHeaded = leftHeaded;
    this.distance = (short)distance;  // distances are stored compactly as shorts
    this.head = headTW;
    this.arg = argTW;
}
/// <summary>
/// Builds a dependency directly from already-constructed head and argument
/// tagged words.
/// </summary>
public IntDependency(IntTaggedWord head, IntTaggedWord arg, bool leftHeaded, int distance)
{
    this.leftHeaded = leftHeaded;
    this.distance = (short)distance;  // distances are stored compactly as shorts
    this.head = head;
    this.arg = arg;
}
/// <summary>
/// Scores an unknown word: computes P(T|S) with Bayesian smoothing toward
/// P(T|Unknown), then converts it to log P(W|T) via Bayes' rule using the
/// tag prior and a uniform word prior.
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence</param>
/// <param name="c_Tseen">Count of this tag over seen words</param>
/// <param name="total">Total token count</param>
/// <param name="smooth">Smoothing strength toward P(T|Unknown)</param>
/// <param name="word">The word as a string</param>
/// <returns>log P(W|T) as a float</returns>
public override float Score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, string word)
{
    double pb_W_T;
    // always set below
    // unknown word model for P(T|S)
    int wordSig = GetSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.GetCount(temp);     // count of (signature, tag)
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.GetCount(temp);      // signature marginal
    double c_U = unSeenCounter.GetCount(NullItw);   // total unseen mass
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.GetCount(temp);      // tag marginal over unseens
    double p_T_U = c_T / c_U;
    if (unknownLevel == 0)
    {
        // Level 0: ignore signatures entirely; back off fully to P(T|Unknown).
        c_TS = 0;
        c_S = 0;
    }
    double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth);
    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;
    // Bayes: P(W|T) proportional to P(T|S) * P(W) / P(T).
    pb_W_T = Math.Log(pb_T_S * p_W / p_T);
    return((float)pb_W_T);
}
// end score()
/// <summary>Calculate P(Tag|Signature) with Bayesian smoothing via just P(Tag|Unknown)</summary>
/// <param name="iTW">Indexed word/tag pair</param>
/// <param name="loc">Position of the word in the sentence</param>
/// <param name="smooth">Smoothing strength toward P(Tag|Unknown)</param>
/// <param name="word">The word as a string</param>
/// <returns>The smoothed probability P(Tag|Signature)</returns>
public override double ScoreProbTagGivenWordSignature(IntTaggedWord iTW, int loc, double smooth, string word)
{
    // iTW.tag = nullTag;
    // double c_W = ((BaseLexicon) l).getCount(iTW);
    // iTW.tag = tag;
    // unknown word model for P(T|S)
    int wordSig = GetSignatureIndex(iTW.word, loc, word);
    IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
    double c_TS = unSeenCounter.GetCount(temp);     // count of (signature, tag)
    temp = new IntTaggedWord(wordSig, nullTag);
    double c_S = unSeenCounter.GetCount(temp);      // signature marginal
    double c_U = unSeenCounter.GetCount(NullItw);   // total unseen mass
    temp = new IntTaggedWord(nullWord, iTW.tag);
    double c_T = unSeenCounter.GetCount(temp);      // tag marginal over unseens
    double p_T_U = c_T / c_U;
    if (unknownLevel == 0)
    {
        // Level 0: ignore signatures entirely; back off fully to P(T|Unknown).
        c_TS = 0;
        c_S = 0;
    }
    return((c_TS + smooth * p_T_U) / (c_S + smooth));
}
/// <summary>Returns true iff the given tagged word carries the boundary tag,
/// i.e. it is a legal root of the dependency structure.</summary>
public virtual bool RootTW(IntTaggedWord rTW)
{
    int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);
    return rTW.tag == boundaryTagId;
}
/// <summary>
/// Scores a word as the log probability of generating its character sequence
/// from a tag-conditioned character n-gram model, plus an optional
/// word-length penalty.
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence (unused here)</param>
/// <param name="word">The word as a string</param>
/// <param name="featureSpec">Feature specification (unused here)</param>
/// <returns>The log probability of the word given the tag</returns>
public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    string tag = tagIndex.Get(iTW.tag);
    System.Diagnostics.Debug.Assert(!word.Equals(LexiconConstants.Boundary));
    char[] chars = word.ToCharArray();
    IList<ISerializable> charList = new List<ISerializable>(chars.Length + ContextLength + 1);
    // this starts of storing Symbol's and then starts storing String's. Clean this up someday!
    // charList is constructed backward
    // END_WORD char[length-1] char[length-2] ... char[0] BEGIN_WORD BEGIN_WORD
    charList.Add(ChineseCharacterBasedLexicon.Symbol.EndWord);
    for (int i = chars.Length - 1; i >= 0; i--)
    {
        ChineseCharacterBasedLexicon.Symbol ch = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(chars[i]);
        if (knownChars.Contains(ch))
        {
            charList.Add(ch);
        }
        else
        {
            // Unseen character: back off to its unknown-character class.
            charList.Add(UnknownCharClass(ch));
        }
    }
    for (int i_1 = 0; i_1 < ContextLength; i_1++)
    {
        charList.Add(ChineseCharacterBasedLexicon.Symbol.BeginWord);
    }
    double score = 0.0;
    // NOTE(review): 'size' is not defined locally — presumably a field tracking
    // the character-list length; confirm it matches charList's length here.
    for (int i_2 = 0; i_2 < size - ContextLength; i_2++)
    {
        ChineseCharacterBasedLexicon.Symbol nextChar = (ChineseCharacterBasedLexicon.Symbol)charList[i_2];
        // Replace the just-generated character with the tag string so the
        // backed-off context conditions on the tag.
        charList.Set(i_2, tag);
        double charScore = GetBackedOffDist(charList.SubList(i_2, i_2 + ContextLength + 1)).ProbabilityOf(nextChar);
        score += Math.Log(charScore);
    }
    // Apply the configured word-length penalty (0 = none, 1 = quadratic, 2 = linear).
    switch (penaltyType)
    {
        case 0:
        {
            break;
        }

        case 1:
        {
            score -= (chars.Length * (chars.Length + 1)) * (lengthPenalty / 2);
            break;
        }

        case 2:
        {
            score -= (chars.Length - 1) * lengthPenalty;
            break;
        }
    }
    return((float)score);
}
/// <summary>Rule table is lemmas!</summary>
/// <remarks>
/// Builds the rulesWithWord table mapping each word id to its observed
/// taggings, mixing in unknown-word signature rules for open-class tags and a
/// single synthetic tagging for the boundary symbol.
/// </remarks>
protected internal override void InitRulesWithWord()
{
    // Add synthetic symbols to the indices
    int unkWord = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
    int boundaryWordId = wordIndex.AddToIndex(LexiconConstants.Boundary);
    int boundaryTagId = tagIndex.AddToIndex(LexiconConstants.BoundaryTag);
    // Initialize rules table
    int numWords = wordIndex.Size();
    rulesWithWord = new IList[numWords];
    for (int w = 0; w < numWords; w++)
    {
        rulesWithWord[w] = new List<IntTaggedWord>(1);
    }
    // Collect rules, indexed by word. The nullWord entries act as per-tag
    // marginals used for the unknown-word check below.
    ICollection<IntTaggedWord> lexRules = Generics.NewHashSet(40000);
    foreach (int wordId in wordTag.FirstKeySet())
    {
        foreach (int tagId in wordTag.GetCounter(wordId).KeySet())
        {
            lexRules.Add(new IntTaggedWord(wordId, tagId));
            lexRules.Add(new IntTaggedWord(nullWord, tagId));
        }
    }
    // Known words and signatures
    foreach (IntTaggedWord iTW in lexRules)
    {
        if (iTW.Word() == nullWord)
        {
            // Mix in UW signature rules for open class types
            double types = uwModel.UnSeenCounter().GetCount(iTW);
            if (types > trainOptions.openClassTypesThreshold)
            {
                IntTaggedWord iTU = new IntTaggedWord(unkWord, iTW.tag);
                if (!rulesWithWord[unkWord].Contains(iTU))
                {
                    rulesWithWord[unkWord].Add(iTU);
                }
            }
        }
        else
        {
            // Known word
            rulesWithWord[iTW.word].Add(iTW);
        }
    }
    log.Info("The " + rulesWithWord[unkWord].Count + " open class tags are: [");
    foreach (IntTaggedWord item in rulesWithWord[unkWord])
    {
        log.Info(" " + tagIndex.Get(item.Tag()));
    }
    log.Info(" ] ");
    // Boundary symbol has one tagging
    rulesWithWord[boundaryWordId].Add(new IntTaggedWord(boundaryWordId, boundaryTagId));
}
/// <summary>
/// Builds the rulesWithWord table from the seen counter: collects the set of
/// observed tags, adds unknown-word rules for open-class tags, then records
/// every observed (word, tag) pair under its word id.
/// </summary>
protected internal virtual void InitRulesWithWord()
{
    if (testOptions.verbose || DebugLexicon)
    {
        log.Info("Initializing lexicon scores ... ");
    }
    // int numWords = words.size()+sigs.size()+1;
    int unkWord = wordIndex.AddToIndex(LexiconConstants.UnknownWord);
    int numWords = wordIndex.Size();
    rulesWithWord = new IList[numWords];
    for (int w = 0; w < numWords; w++)
    {
        rulesWithWord[w] = new List<IntTaggedWord>(1);
    }
    // most have 1 or 2
    // items in them
    // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
    // Gather the tag marginals (nullWord entries with a real tag).
    tags = Generics.NewHashSet();
    foreach (IntTaggedWord iTW in seenCounter.KeySet())
    {
        if (iTW.Word() == nullWord && iTW.Tag() != nullTag)
        {
            tags.Add(iTW);
        }
    }
    // tags for unknown words
    foreach (IntTaggedWord iT in tags)
    {
        double types = uwModel.UnSeenCounter().GetCount(iT);
        if (types > trainOptions.openClassTypesThreshold)
        {
            // Number of types before it's treated as open class
            IntTaggedWord iTW_1 = new IntTaggedWord(unkWord, iT.tag);
            rulesWithWord[iTW_1.word].Add(iTW_1);
        }
    }
    if (testOptions.verbose || DebugLexicon)
    {
        StringBuilder sb = new StringBuilder();
        sb.Append("The ").Append(rulesWithWord[unkWord].Count).Append(" open class tags are: [");
        foreach (IntTaggedWord item in rulesWithWord[unkWord])
        {
            sb.Append(' ').Append(tagIndex.Get(item.Tag()));
        }
        sb.Append(" ]");
        log.Info(sb.ToString());
    }
    // Finally, record every fully-specified observed tagging under its word.
    foreach (IntTaggedWord iTW_2 in seenCounter.KeySet())
    {
        if (iTW_2.Tag() != nullTag && iTW_2.Word() != nullWord)
        {
            rulesWithWord[iTW_2.word].Add(iTW_2);
        }
    }
}
/// <summary>Checks whether a word is in the lexicon.</summary>
/// <remarks>
/// Checks whether a word is in the lexicon. This version works even while
/// compiling lexicon with current counters (rather than using the compiled
/// rulesWithWord array).
/// TODO: The previous version would insert rules into the
/// wordNumberer. Is that the desired behavior? Why not test in
/// some way that doesn't affect the index? For example, start by
/// testing wordIndex.contains(word). This version does exactly that.
/// </remarks>
/// <param name="word">The word as a String</param>
/// <returns>Whether the word is in the lexicon</returns>
public virtual bool IsKnown(string word)
{
    if (wordIndex.Contains(word))
    {
        // Known only if we actually counted it, not merely indexed it.
        IntTaggedWord query = new IntTaggedWord(wordIndex.IndexOf(word), nullTag);
        return seenCounter.GetCount(query) > 0.0;
    }
    return false;
}
/// <summary>Adds the tagging with count to the data structures in this Lexicon.</summary>
/// <param name="seen">Whether the word was seen in training (must be false here)</param>
/// <param name="itw">The indexed word/tag pair to record</param>
/// <param name="count">The count to add</param>
public virtual void AddTagging(bool seen, IntTaggedWord itw, double count)
{
    if (!seen)
    {
        unSeenCounter.IncrementCount(itw, count);
        return;
    }
    // Seen words belong to the main lexicon, not the unknown-word model.
    log.Info("UWM.addTagging: Shouldn't call with seen word!");
}
/// <summary>
/// Converts a list of tagged words into their indexed IntTaggedWord
/// equivalents, in the same order.
/// </summary>
/// <param name="taggedWords">The tagged words to convert</param>
/// <returns>A new list of indexed events, one per input word</returns>
protected internal virtual IList<IntTaggedWord> ListToEvents(IList<TaggedWord> taggedWords)
{
    IList<IntTaggedWord> events = new List<IntTaggedWord>();
    foreach (TaggedWord taggedWord in taggedWords)
    {
        events.Add(new IntTaggedWord(taggedWord.Word(), taggedWord.Tag(), wordIndex, tagIndex));
    }
    return events;
}
/// <summary>
/// Returns true iff the tagged word should be pruned from dependency
/// extraction, i.e. its tag is one of the language pack's punctuation tags.
/// </summary>
public virtual bool PruneTW(IntTaggedWord argTW)
{
    foreach (string punctTag in tlp.PunctuationTags())
    {
        if (tagIndex.IndexOf(punctTag) == argTW.tag)
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Scores an unknown word: computes smoothed P(T|S) and inverts it via
/// Bayes' rule with the tag prior and a uniform word prior to get log P(W|T).
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence</param>
/// <param name="c_Tseen">Count of this tag over seen words</param>
/// <param name="total">Total token count</param>
/// <param name="smooth">Smoothing strength for P(T|S)</param>
/// <param name="word">The word as a string</param>
/// <returns>log P(W|T), or negative infinity for very low-probability taggings</returns>
public override float Score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, string word)
{
    double tagGivenSig = ScoreProbTagGivenWordSignature(iTW, loc, smooth, word);
    double tagPrior = c_Tseen / total;
    double wordPrior = 1.0 / total;
    double logProb = Math.Log(tagGivenSig * wordPrior / tagPrior);
    // Filter out vanishingly improbable taggings.
    if (logProb <= -100.0)
    {
        return float.NegativeInfinity;
    }
    return (float)logProb;
}
/// <summary>
/// Sets up a dependency grammar with its language pack, tag projection,
/// distance/direction options, and shared word/tag indices, then creates the
/// stop and wildcard sentinel tagged words and initializes the tag bins.
/// </summary>
public AbstractDependencyGrammar(ITreebankLanguagePack tlp, ITagProjection tagProjection, bool directional, bool useDistance, bool useCoarseDistance, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    // Collaborators and shared indices.
    this.tlp = tlp;
    this.tagProjection = tagProjection;
    this.op = op;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    // Configuration flags.
    this.directional = directional;
    this.useDistance = useDistance;
    this.useCoarseDistance = useCoarseDistance;
    // Sentinels: the stop symbol and the any-word/any-tag wildcard.
    stopTW = new IntTaggedWord(IntTaggedWord.StopWordInt, IntTaggedWord.StopTagInt);
    wildTW = new IntTaggedWord(IntTaggedWord.AnyWordInt, IntTaggedWord.AnyTagInt);
    InitTagBins();
}
/// <summary>Generate the possible taggings for a word at a sentence position.</summary>
/// <remarks>
/// Generate the possible taggings for a word at a sentence position.
/// This may either be based on a strict lexicon or an expanded generous
/// set of possible taggings. <p>
/// <i>Implementation note:</i> Expanded sets of possible taggings are
/// calculated dynamically at runtime, so as to reduce the memory used by
/// the lexicon (a space/time tradeoff).
/// </remarks>
/// <param name="word">The word (as an int)</param>
/// <param name="loc">Its index in the sentence (usually only relevant for unknown words)</param>
/// <returns>A list of possible taggings</returns>
public virtual IEnumerator<IntTaggedWord> RuleIteratorByWord(int word, int loc, string featureSpec)
{
    // if (rulesWithWord == null) { // tested in isKnown already
    //   initRulesWithWord();
    // }
    IList<IntTaggedWord> wordTaggings;
    if (IsKnown(word))
    {
        if (!flexiTag)
        {
            // Strict lexical tagging for seen items
            wordTaggings = rulesWithWord[word];
        }
        else
        {
            /* Allow all tags with same basicCategory */
            /* Allow all scored taggings, unless very common */
            IntTaggedWord iW = new IntTaggedWord(word, nullTag);
            if (seenCounter.GetCount(iW) > smoothInUnknownsThreshold)
            {
                // Very common word: trust its observed taggings only.
                return(rulesWithWord[word].GetEnumerator());
            }
            else
            {
                // give it flexible tagging not just lexicon:
                // try every known tag and keep those with a finite score.
                wordTaggings = new List<IntTaggedWord>(40);
                foreach (IntTaggedWord iTW2 in tags)
                {
                    IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
                    if (Score(iTW, loc, wordIndex.Get(word), null) > float.NegativeInfinity)
                    {
                        wordTaggings.Add(iTW);
                    }
                }
            }
        }
    }
    else
    {
        // Unknown word: use the unknown-word taggings, substituting this word.
        // we copy list so we can insert correct word in each item
        wordTaggings = new List<IntTaggedWord>(40);
        foreach (IntTaggedWord iTW in rulesWithWord[wordIndex.IndexOf(LexiconConstants.UnknownWord)])
        {
            wordTaggings.Add(new IntTaggedWord(word, iTW.tag));
        }
    }
    return(wordTaggings.GetEnumerator());
}
/// <summary>
/// Scores a tagging from the cached per-word tag log-probabilities, pruning
/// tags that fall too far below the best tag for this word.
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence (unused)</param>
/// <param name="word">The word as a string (unused)</param>
/// <param name="featureSpec">Feature specification (unused)</param>
/// <returns>The tag's log probability, or negative infinity if pruned</returns>
public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    EnsureProbs(iTW.Word());
    double best = Counters.Max(logProbs);
    double tagLogProb = logProbs.GetCount(iTW.TagString(tagIndex));
    if (tagLogProb <= best - iteratorCutoffFactor)
    {
        return float.NegativeInfinity;
    }
    return (float)tagLogProb;
}
/// <summary>
/// This is a custom interner that simultaneously creates and interns
/// an IntDependency.
/// </summary>
/// <param name="headTW">Head tagged word (interned before use)</param>
/// <param name="argTW">Argument tagged word (interned before use)</param>
/// <param name="leftHeaded">Whether the head is to the left of the argument</param>
/// <param name="dist">Binned distance between head and argument</param>
/// <returns>An interned IntDependency</returns>
protected internal virtual IntDependency Intern(IntTaggedWord headTW, IntTaggedWord argTW, bool leftHeaded, short dist)
{
    IDictionary<IntDependency, IntDependency> map = expandDependencyMap;
    IntDependency internTempDependency = new IntDependency(itwInterner.Intern(headTW), itwInterner.Intern(argTW), leftHeaded, dist);
    if (map == null)
    {
        // No interning map configured: return the fresh instance.
        return internTempDependency;
    }
    // BUG FIX: the Java original used map.get(), which returns null on a miss;
    // the C# dictionary indexer throws KeyNotFoundException instead, so use
    // TryGetValue for the lookup.
    IntDependency returnDependency;
    if (!map.TryGetValue(internTempDependency, out returnDependency))
    {
        map[internTempDependency] = internTempDependency;
        returnDependency = internTempDependency;
    }
    return returnDependency;
}
/// <summary>
/// Enumerates the plausible taggings for a word: every tag whose cached log
/// probability is within the cutoff of the best tag for this word.
/// </summary>
/// <param name="word">The word (as an int)</param>
/// <param name="loc">Its index in the sentence (unused)</param>
/// <param name="featureSpec">Feature specification (unused)</param>
/// <returns>An enumerator over the surviving taggings</returns>
public virtual IEnumerator<IntTaggedWord> RuleIteratorByWord(int word, int loc, string featureSpec)
{
    EnsureProbs(word);
    double threshold = Counters.Max(logProbs) - iteratorCutoffFactor;
    IList<IntTaggedWord> taggings = new List<IntTaggedWord>();
    for (int tag = 0; tag < tagIndex.Size(); tag++)
    {
        if (logProbs.GetCount(tagIndex.Get(tag)) > threshold)
        {
            taggings.Add(new IntTaggedWord(word, tag));
        }
    }
    return taggings.GetEnumerator();
}
/// <summary>Trains the first-character based unknown word model.</summary>
/// <param name="tw">The word we are currently training on</param>
/// <param name="loc">The position of that word</param>
/// <param name="weight">The weight to give this word in terms of training</param>
public override void Train(TaggedWord tw, int loc, double weight)
{
    if (useGT)
    {
        // Also feed the Good-Turing trainer when enabled.
        unknownGTTrainer.Train(tw, weight);
    }
    string word = tw.Word();
    ILabel tagL = new Tag(tw.Tag());
    string first = Sharpen.Runtime.Substring(word, 0, 1);
    if (useUnicodeType)
    {
        // Optionally abstract the first character to its Unicode category
        // instead of its literal form.
        char ch = word[0];
        int type = char.GetType(ch);
        if (type != char.OtherLetter)
        {
            // standard Chinese characters are of type "OTHER_LETTER"!!
            first = int.ToString(type);
        }
    }
    string tag = tw.Tag();
    if (!c.Contains(tagL))
    {
        c[tagL] = new ClassicCounter<string>();
    }
    // Count first-character given tag, plus the tag marginal.
    c[tagL].IncrementCount(first, weight);
    tc.IncrementCount(tagL, weight);
    seenFirst.Add(first);
    IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(iW, weight);
    if (treesRead > indexToStartUnkCounting)
    {
        // start doing this once some way through trees;
        // treesRead is 1 based counting
        if (seenCounter.GetCount(iW) < 2)
        {
            // Word seen fewer than twice so far: treat it as unknown for this tag.
            IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
            unSeenCounter.IncrementCount(iT, weight);
            unSeenCounter.IncrementCount(iTotal, weight);
        }
    }
}
/// <summary>
/// Populates data in this DependencyGrammar from the character stream
/// given by the Reader r.
/// </summary>
/// <remarks>
/// Each line holds one rule: head, arg, direction ("left"/other), binned
/// distance, and count. Lines after a "BEGIN_STOP" marker are stop rules.
/// </remarks>
/// <exception cref="System.IO.IOException">Wraps any per-line parse failure with its line number</exception>
public override void ReadData(BufferedReader @in)
{
    string Left = "left";
    int lineNum = 1;
    // all lines have one rule per line
    bool doingStop = false;
    for (string line = @in.ReadLine(); line != null && line.Length > 0; line = @in.ReadLine())
    {
        try
        {
            if (line.Equals("BEGIN_STOP"))
            {
                // Everything after this marker is a stop rule.
                doingStop = true;
                continue;
            }
            string[] fields = StringUtils.SplitOnCharWithQuoting(line, ' ', '\"', '\\');
            // split on spaces, quote with doublequote, and escape with backslash
            // System.out.println("fields:\n" + fields[0] + "\n" + fields[1] + "\n" + fields[2] + "\n" + fields[3] + "\n" + fields[4] + "\n" + fields[5]);
            short distance = (short)System.Convert.ToInt32(fields[4]);
            IntTaggedWord tempHead = new IntTaggedWord(fields[0], '/', wordIndex, tagIndex);
            IntTaggedWord tempArg = new IntTaggedWord(fields[2], '/', wordIndex, tagIndex);
            IntDependency tempDependency = new IntDependency(tempHead, tempArg, fields[3].Equals(Left), distance);
            double count = double.Parse(fields[5]);
            if (doingStop)
            {
                ExpandStop(tempDependency, distance, count, false);
            }
            else
            {
                ExpandArg(tempDependency, distance, count);
            }
        }
        catch (Exception e)
        {
            // Rewrap with the offending line for easier debugging of bad files.
            IOException ioe = new IOException("Error on line " + lineNum + ": " + line);
            ioe.InitCause(e);
            throw ioe;
        }
        // System.out.println("read line " + lineNum + ": " + line);
        lineNum++;
    }
}
/// <summary>
/// Records a dependency observation in the stop counter, under both the
/// lexicalized head and its tag-only backoff, for the stop event (when the
/// argument is the stop word) and/or the continue denominator (wildcard arg).
/// </summary>
/// <param name="dependency">The observed dependency (tags in the full tag space)</param>
/// <param name="distBinDist">Binned distance of the dependency</param>
/// <param name="count">Observation count to add</param>
/// <param name="wildForStop">If true, also count the wildcard denominator for stop events</param>
private void ExpandStop(IntDependency dependency, short distBinDist, double count, bool wildForStop)
{
    // Tag-only backoff head (wildcard word, binned tag).
    IntTaggedWord headT = GetCachedITW(dependency.head.tag);
    IntTaggedWord head = new IntTaggedWord(dependency.head.word, TagBin(dependency.head.tag));
    //dependency.head;
    IntTaggedWord arg = new IntTaggedWord(dependency.arg.word, TagBin(dependency.arg.tag));
    //dependency.arg;
    bool leftHeaded = dependency.leftHeaded;
    if (arg.word == IntTaggedWord.StopWordInt)
    {
        // Numerator: an actual stop event, lexicalized and tag-only.
        stopCounter.IncrementCount(Intern(head, arg, leftHeaded, distBinDist), count);
        stopCounter.IncrementCount(Intern(headT, arg, leftHeaded, distBinDist), count);
    }
    if (wildForStop || arg.word != IntTaggedWord.StopWordInt)
    {
        // Denominator: any generation event at this position (wildcard argument).
        stopCounter.IncrementCount(Intern(head, wildTW, leftHeaded, distBinDist), count);
        stopCounter.IncrementCount(Intern(headT, wildTW, leftHeaded, distBinDist), count);
    }
}
// log.info("stopCounter: " + stopCounter);
// log.info("argCounter: " + argCounter);
//new ArrayList();
/// <summary>
/// This maps from a tag to a cached IntTagWord that represents the
/// tag by having the wildcard word ANY_WORD_INT and the tag in the
/// reduced tag space.
/// </summary>
/// <remarks>
/// This maps from a tag to a cached IntTagWord that represents the
/// tag by having the wildcard word ANY_WORD_INT and the tag in the
/// reduced tag space.
/// The argument is in terms of the full tag space; internally this
/// function maps to the reduced space.
/// </remarks>
/// <param name="tag">short representation of tag in full tag space</param>
/// <returns>an IntTaggedWord in the reduced tag space</returns>
private IntTaggedWord GetCachedITW(short tag)
{
    // The +2 below is because -1 and -2 are used with special meanings (see IntTaggedWord).
    if (tagITWList == null)
    {
        // Lazily build the cache, pre-filled with null slots.
        tagITWList = new List<IntTaggedWord>(numTagBins + 2);
        for (int i = 0; i < numTagBins + 2; i++)
        {
            tagITWList.Add(i, null);
        }
    }
    IntTaggedWord headT = tagITWList[TagBin(tag) + 2];
    if (headT == null)
    {
        // First request for this tag bin: create and cache the wildcard-word entry.
        headT = new IntTaggedWord(IntTaggedWord.AnyWordInt, TagBin(tag));
        tagITWList.Set(TagBin(tag) + 2, headT);
    }
    return(headT);
}
/// <summary>
/// Finalizes training of the unknown word model. If no unseen counts were
/// collected, falls back to a uniform distribution over all non-boundary tags
/// so downstream scoring never divides by zero.
/// </summary>
/// <returns>The trained unknown word model</returns>
public override IUnknownWordModel FinishTraining()
{
    // make sure the unseen counter isn't empty! If it is, put in
    // a uniform unseen over tags
    if (unSeenCounter.IsEmpty())
    {
        System.Console.Error.Printf("%s: WARNING: Unseen word counter is empty!", this.GetType().FullName);
        int numTags = tagIndex.Size();
        for (int tt = 0; tt < numTags; tt++)
        {
            if (!BoundaryTag.Equals(tagIndex.Get(tt)))
            {
                // One count per tag marginal, plus the matching grand total.
                IntTaggedWord iT = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, tt);
                IntTaggedWord i = UnknownWordModelTrainerConstants.NullItw;
                unSeenCounter.IncrementCount(iT);
                unSeenCounter.IncrementCount(i);
            }
        }
    }
    return(model);
}
// NOTE: an alternative maxent unknown-word model (ChineseMaxentLexicon) was
// stubbed out here; the unknown word model is configured elsewhere.
/// <summary>
/// Scores a tagging with the base lexicon when the word was seen in training,
/// and with the unknown-word model otherwise.
/// </summary>
/// <param name="iTW">Indexed word/tag pair to score</param>
/// <param name="loc">Position of the word in the sentence</param>
/// <param name="word">The word as a string</param>
/// <param name="featureSpec">Feature specification string</param>
/// <returns>The log-probability score of the tagging</returns>
public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    bool seen = seenCounter.GetCount(iTW) > 0.0;
    if (!seen)
    {
        // ChineseUnknownWordModel ignores the three trailing numeric parameters.
        return this.GetUnknownWordModel().Score(iTW, loc, 0.0, 0.0, 0.0, word);
    }
    return base.Score(iTW, loc, word, featureSpec);
}
/// <summary>
/// Return the probability (as a real number between 0 and 1) of stopping
/// rather than generating another argument at this position.
/// </summary>
/// <param name="dependency">
/// The dependency used as the basis for stopping on.
/// Tags are assumed to be in the TagProjection space.
/// </param>
/// <returns>The probability of generating this stop probability</returns>
protected internal virtual double GetStopProb(IntDependency dependency)
{
    short binDistance = DistanceBin(dependency.distance);
    // Backoff heads: unknown-word (-1) and wildcard-word versions of the head tag.
    IntTaggedWord unknownHead = new IntTaggedWord(-1, dependency.head.tag);
    IntTaggedWord anyHead = new IntTaggedWord(IntTaggedWord.AnyWordInt, dependency.head.tag);
    // Lexicalized stop count: head stopped at this distance/direction.
    IntDependency temp = new IntDependency(dependency.head, stopTW, dependency.leftHeaded, binDistance);
    double c_stop_hTWds = stopCounter.GetCount(temp);
    // Tag-only stop count.
    temp = new IntDependency(unknownHead, stopTW, dependency.leftHeaded, binDistance);
    double c_stop_hTds = stopCounter.GetCount(temp);
    // Lexicalized denominator: any event for this head at this position.
    temp = new IntDependency(dependency.head, wildTW, dependency.leftHeaded, binDistance);
    double c_hTWds = stopCounter.GetCount(temp);
    // Tag-only denominator.
    temp = new IntDependency(anyHead, wildTW, dependency.leftHeaded, binDistance);
    double c_hTds = stopCounter.GetCount(temp);
    // Tag-level stop probability (defaults to 1.0 when the tag was never seen).
    double p_stop_hTds = (c_hTds > 0.0 ? c_stop_hTds / c_hTds : 1.0);
    // Lexicalized estimate smoothed toward the tag-level estimate.
    double pb_stop_hTWds = (c_stop_hTWds + smooth_stop * p_stop_hTds) / (c_hTWds + smooth_stop);
    return(pb_stop_hTWds);
}