/// <summary>
/// Scores a tagged word against the values currently held in <c>logProbs</c>.
/// </summary>
/// <param name="iTW">The tag/word pair; its word is passed to <c>EnsureProbs</c> and its tag string is looked up in <c>logProbs</c>.</param>
/// <param name="loc">Not read by this implementation.</param>
/// <param name="word">Not read by this implementation.</param>
/// <param name="featureSpec">Not read by this implementation.</param>
/// <returns>
/// The value stored for the tag when it is strictly greater than the maximum
/// stored value minus <c>iteratorCutoffFactor</c>; otherwise negative infinity.
/// </returns>
public virtual float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
{
    EnsureProbs(iTW.Word());

    double best = Counters.Max(logProbs);
    double tagValue = logProbs.GetCount(iTW.TagString(tagIndex));
    double threshold = best - iteratorCutoffFactor;

    // Tags that fall too far below the best one are treated as impossible.
    return tagValue > threshold ? (float)tagValue : float.NegativeInfinity;
}
/// <summary>
/// Calculate the log-prob score of a particular TaggedWord in the
/// unknown word model. Words matching <c>numberMatch</c> are scored
/// categorically; every other word is delegated to the base model.
/// </summary>
/// <param name="itw">the tag->word production in IntTaggedWord form</param>
/// <param name="word">the surface form that is tested against <c>numberMatch</c></param>
/// <returns>
/// For a number-like word: 0 when the tag is "CARD", negative infinity for
/// any other tag. Otherwise the result of the base class's Score.
/// </returns>
public override float Score(IntTaggedWord itw, string word)
{
    string tag = itw.TagString(tagIndex);

    if (!word.Matches(numberMatch))
    {
        return base.Score(itw, word);
    }

    // Number-like words may only carry the "CARD" tag.
    return tag.Equals("CARD") ? 0.0f : float.NegativeInfinity;
}
// todo [cdm 2010]: Recheck that this method really does the right thing in making a P(W|T) estimate....
/// <summary>
/// Scores an unknown word for the tag carried by <paramref name="itw"/>.
/// </summary>
/// <remarks>
/// When none of <c>useEnd</c>, <c>useFirst</c>, <c>useFirstCap</c> is set, the result is
/// <c>ScoreGT</c> if <c>useGT</c> is set, otherwise a warning is logged and negative
/// infinity is returned. When a signature model is active, the word's signature
/// (computed without sentence position) is looked up in the per-tag counter from
/// <c>tagHash</c>, falling back to the <c>unknown</c> entry when the signature was
/// not seen or is absent from that counter.
/// </remarks>
/// <param name="itw">The tag/word pair; only its tag string is read here.</param>
/// <param name="word">The surface form whose signature is scored.</param>
/// <returns>The value read from the model, or negative infinity when no estimate is available.</returns>
public virtual float Score(IntTaggedWord itw, string word)
{
    string tagStr = itw.TagString(tagIndex);
    ILabel tag = new Tag(tagStr);

    bool signatureModelActive = useEnd || useFirst || useFirstCap;
    if (!signatureModelActive)
    {
        if (useGT)
        {
            return ScoreGT(tagStr);
        }

        // should never get this!
        log.Info("Warning: no unknown word model in place!\nGiving the combination " + word + ' ' + tagStr + " zero probability.");
        return float.NegativeInfinity;
    }

    // The signature requested here does not use sentence position.
    string signature = GetSignature(word, -1);
    bool signatureSeen = seenEnd.Contains(signature);

    if (useGT && !signatureSeen)
    {
        return ScoreGT(tagStr);
    }

    string key = signatureSeen ? signature : unknown;

    // Counter of terminal rewrites for the proposed tag.
    // NOTE(review): the null check below assumes the tagHash indexer yields null
    // for an unseen tag rather than throwing - confirm against the map type.
    ClassicCounter<string> wordProbs = tagHash[tag];
    if (wordProbs == null)
    {
        // The proposed tag was never seen: warn and give it probability 0.
        log.Info("Warning: proposed tag is unseen in training data:\t" + tagStr);
        return float.NegativeInfinity;
    }

    if (!wordProbs.KeySet().Contains(key))
    {
        key = unknown;
    }

    return (float)wordProbs.GetCount(key);
}
/// <summary>
/// Scores an unknown word for the tag carried by <paramref name="itw"/>, using
/// categorical rules for dates, numbers and proper names and a first-character
/// model for everything else.
/// </summary>
/// <remarks>
/// Rules are tried in this order:
/// a word matching <c>dateMatch</c> scores 0 only for tag "NT";
/// a word matching <c>numberMatch</c> scores 0 for "CD" when it does not match
/// <c>ordinalMatch</c>, or for "OD" when it does;
/// a word matching <c>properNameMatch</c> scores 0 only for tag "NR".
/// In each of those cases every other tag scores negative infinity.
/// Remaining words use the first-character signature when <c>useFirst</c> is set,
/// otherwise <c>ScoreGT</c> when <c>useGT</c> is set, otherwise negative infinity.
/// </remarks>
/// <param name="itw">The tag/word pair; only its tag string is read here.</param>
/// <param name="word">The surface form being scored.</param>
/// <returns>The value read from the model, 0 for a categorical match, or negative infinity.</returns>
public override float Score(IntTaggedWord itw, string word)
{
    string tag = itw.TagString(tagIndex);

    if (word.Matches(dateMatch))
    {
        return tag.Equals("NT") ? 0.0f : float.NegativeInfinity;
    }

    if (word.Matches(numberMatch))
    {
        if (tag.Equals("CD") && !word.Matches(ordinalMatch))
        {
            return 0.0f;
        }

        if (tag.Equals("OD") && word.Matches(ordinalMatch))
        {
            return 0.0f;
        }

        return float.NegativeInfinity;
    }

    if (word.Matches(properNameMatch))
    {
        return tag.Equals("NR") ? 0.0f : float.NegativeInfinity;
    }

    // An earlier heuristic that forced punctuation tags from the character type of
    // the first character was tried here and dropped as "too categorical".

    if (!useFirst)
    {
        return useGT ? ScoreGT(tag) : float.NegativeInfinity;
    }

    string first = Sharpen.Runtime.Substring(word, 0, 1);
    if (useUnicodeType)
    {
        // The previous code called char.GetType(ch) / char.OtherLetter / int.ToString(type),
        // none of which exist in .NET; char.GetUnicodeCategory is the .NET counterpart.
        // NOTE(review): the numeric values of UnicodeCategory are not the same as the
        // constants returned by Java's Character.getType. The signatures held in
        // seenFirst and the per-tag counters must have been produced with this same
        // encoding - confirm against the code that builds them.
        System.Globalization.UnicodeCategory category = char.GetUnicodeCategory(word[0]);
        if (category != System.Globalization.UnicodeCategory.OtherLetter)
        {
            // standard Chinese characters are of type "OTHER_LETTER"!!
            first = ((int)category).ToString(System.Globalization.CultureInfo.InvariantCulture);
        }
    }

    if (!seenFirst.Contains(first))
    {
        if (useGT)
        {
            return ScoreGT(tag);
        }

        first = unknown;
    }

    // Counter of terminal rewrites for the proposed tag.
    ILabel tagL = new Tag(tag);
    ClassicCounter<string> wordProbs = tagHash[tagL];
    if (wordProbs == null)
    {
        // The proposed tag was never seen: give it probability 0.
        return float.NegativeInfinity;
    }

    string key = wordProbs.ContainsKey(first) ? first : unknown;
    return (float)wordProbs.GetCount(key);
}
/// <summary>
/// Runs the dependency parse over <paramref name="sentence"/>: fills the inside
/// score table <c>iScoreH</c>, and, when <c>op.doPCFG</c> is set, the outside score
/// table <c>oScoreH</c> and the <c>iPossibleBy*</c>/<c>oPossibleBy*</c> filters.
/// </summary>
/// <remarks>
/// The working arrays are re-created via <c>CreateArrays</c> when the sentence is
/// longer than the current <c>arraySize</c>. The thread's interrupted status is
/// polled between the major phases and once per span length.
/// </remarks>
/// <typeparam name="_T0">Element type of the sentence; must expose a word.</typeparam>
/// <param name="sentence">The words to parse. Items that are also <c>IHasTag</c> with a
/// non-empty tag restrict the taggings considered for that position.</param>
/// <returns>The result of <c>HasParse()</c>.</returns>
/// <exception cref="OutOfMemoryException">
/// The sentence is longer than <c>op.testOptions.maxLength + 1</c> or not shorter
/// than <c>myMaxLength</c>, or array creation failed.
/// </exception>
/// <exception cref="RuntimeInterruptedException">The thread was interrupted.</exception>
public virtual bool Parse<_T0>(IList<_T0> sentence)
    where _T0 : IHasWord
{
    if (op.testOptions.verbose)
    {
        Timing.Tick("Starting dependency parse.");
    }

    this.sentence = sentence;
    int length = sentence.Count;

    if (length > arraySize)
    {
        if (length > op.testOptions.maxLength + 1 || length >= myMaxLength)
        {
            throw new OutOfMemoryException("Refusal to create such large arrays.");
        }
        else
        {
            try
            {
                CreateArrays(length + 1);
            }
            catch (OutOfMemoryException)
            {
                // Remember the failing length so it is refused next time, then try to
                // restore arrays of the previous size before propagating the failure.
                myMaxLength = length;
                if (arraySize > 0)
                {
                    try
                    {
                        CreateArrays(arraySize);
                    }
                    catch (OutOfMemoryException)
                    {
                        throw new Exception("CANNOT EVEN CREATE ARRAYS OF ORIGINAL SIZE!!! " + arraySize);
                    }
                }
                throw;
            }

            arraySize = length + 1;
            if (op.testOptions.verbose)
            {
                log.Info("Created dparser arrays of size " + arraySize);
            }
        }
    }

    if (op.testOptions.verbose)
    {
        log.Info("Initializing...");
    }

    // map to words
    words = new int[length];
    int numTags = dg.NumTagBins();
    bool[][] hasTag = new bool[length][];
    for (int i = 0; i < length; i++)
    {
        words[i] = wordIndex.AddToIndex(sentence[i].Word());
        // Each row must be allocated explicitly: a jagged array created with
        // new bool[length][] has null rows, so hasTag[hWord][hTag] would fail below.
        hasTag[i] = new bool[numTags];
    }

    for (int head = 0; head < length; head++)
    {
        for (int tag = 0; tag < numTags; tag++)
        {
            Arrays.Fill(iScoreH[head][tag], float.NegativeInfinity);
            Arrays.Fill(oScoreH[head][tag], float.NegativeInfinity);
        }
    }

    for (int head_1 = 0; head_1 < length; head_1++)
    {
        for (int loc = 0; loc <= length; loc++)
        {
            rawDistance[head_1][loc] = (head_1 >= loc ? head_1 - loc : loc - head_1 - 1);
            binDistance[head_1][loc] = dg.DistanceBin(rawDistance[head_1][loc]);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    // do tags
    for (int start = 0; start + 1 <= length; start++)
    {
        // Force tags
        string trueTagStr = null;
        if (sentence[start] is IHasTag)
        {
            trueTagStr = ((IHasTag)sentence[start]).Tag();
            if (string.Empty.Equals(trueTagStr))
            {
                trueTagStr = null;
            }
        }

        // Word context (e.g., morphosyntactic info)
        string wordContextStr = null;
        if (sentence[start] is IHasContext)
        {
            wordContextStr = ((IHasContext)sentence[start]).OriginalText();
            if (string.Empty.Equals(wordContextStr))
            {
                wordContextStr = null;
            }
        }

        int word = words[start];
        for (IEnumerator<IntTaggedWord> taggingI = lex.RuleIteratorByWord(word, start, wordContextStr); taggingI.MoveNext();)
        {
            IntTaggedWord tagging = taggingI.Current;
            if (trueTagStr != null)
            {
                // Skip taggings whose basic category differs from the forced tag.
                if (!tlp.BasicCategory(tagging.TagString(tagIndex)).Equals(trueTagStr))
                {
                    continue;
                }
            }

            float score = lex.Score(tagging, start, wordIndex.Get(tagging.word), wordContextStr);
            if (score > float.NegativeInfinity)
            {
                // Only whether the tagging is possible is recorded; the lexicon score
                // itself is not added to the table.
                int tag = tagging.tag;
                iScoreH[start][dg.TagBin(tag)][start] = 0.0f;
                iScoreH[start][dg.TagBin(tag)][start + 1] = 0.0f;
            }
        }
    }

    for (int hWord = 0; hWord < length; hWord++)
    {
        for (int hTag = 0; hTag < numTags; hTag++)
        {
            hasTag[hWord][hTag] = (iScoreH[hWord][hTag][hWord] + iScoreH[hWord][hTag][hWord + 1] > float.NegativeInfinity);
            Arrays.Fill(headStop[hWord][hTag], float.NegativeInfinity);
            for (int aWord = 0; aWord < length; aWord++)
            {
                for (int dist = 0; dist < dg.NumDistBins(); dist++)
                {
                    Arrays.Fill(headScore[dist][hWord][hTag][aWord], float.NegativeInfinity);
                }
            }
        }
    }

    // score and cache all pairs -- headScores and stops
    for (int hWord_1 = 0; hWord_1 < length; hWord_1++)
    {
        for (int hTag = 0; hTag < numTags; hTag++)
        {
            if (!hasTag[hWord_1][hTag])
            {
                continue;
            }

            for (int split = 0; split <= length; split++)
            {
                if (split <= hWord_1)
                {
                    headStop[hWord_1][hTag][split] = (float)dg.ScoreTB(words[hWord_1], hTag, -2, -2, false, hWord_1 - split);
                }
                else
                {
                    headStop[hWord_1][hTag][split] = (float)dg.ScoreTB(words[hWord_1], hTag, -2, -2, true, split - hWord_1 - 1);
                }
            }

            for (int aWord = 0; aWord < length; aWord++)
            {
                if (aWord == hWord_1)
                {
                    // can't be argument of yourself
                    continue;
                }

                bool leftHeaded = hWord_1 < aWord;
                int start_1;
                int end;
                if (leftHeaded)
                {
                    // if leftHeaded, go from hWord+1 to aWord
                    start_1 = hWord_1 + 1;
                    end = aWord + 1;
                }
                else
                {
                    // else go from aWord+1 to hWord
                    start_1 = aWord + 1;
                    end = hWord_1 + 1;
                }

                for (int aTag = 0; aTag < numTags; aTag++)
                {
                    if (!hasTag[aWord][aTag])
                    {
                        continue;
                    }

                    for (int split_1 = start_1; split_1 < end; split_1++)
                    {
                        int headDistance = rawDistance[hWord_1][split_1];
                        int binDist = binDistance[hWord_1][split_1];
                        headScore[binDist][hWord_1][hTag][aWord][aTag] = (float)dg.ScoreTB(words[hWord_1], hTag, words[aWord], aTag, leftHeaded, headDistance);

                        // skip other splits with same binDist
                        while (split_1 + 1 < end && binDistance[hWord_1][split_1 + 1] == binDist)
                        {
                            split_1++;
                        }
                    }
                }
            }
        }
    }

    if (op.testOptions.verbose)
    {
        Timing.Tick("done.");
        log.Info("Starting insides...");
    }

    // do larger spans
    for (int diff = 2; diff <= length; diff++)
    {
        if (Thread.Interrupted())
        {
            throw new RuntimeInterruptedException();
        }

        for (int start_1 = 0; start_1 + diff <= length; start_1++)
        {
            int end = start_1 + diff;

            // left extension
            int endHead = end - 1;
            for (int endTag = 0; endTag < numTags; endTag++)
            {
                if (!hasTag[endHead][endTag])
                {
                    continue;
                }

                // bestScore is max for iScoreH
                float bestScore = float.NegativeInfinity;
                for (int argHead = start_1; argHead < endHead; argHead++)
                {
                    for (int argTag = 0; argTag < numTags; argTag++)
                    {
                        if (!hasTag[argHead][argTag])
                        {
                            continue;
                        }

                        float argLeftScore = iScoreH[argHead][argTag][start_1];
                        if (argLeftScore == float.NegativeInfinity)
                        {
                            continue;
                        }

                        float stopLeftScore = headStop[argHead][argTag][start_1];
                        if (stopLeftScore == float.NegativeInfinity)
                        {
                            continue;
                        }

                        for (int split = argHead + 1; split < end; split++)
                        {
                            // short circuit if dependency is impossible
                            float depScore = headScore[binDistance[endHead][split]][endHead][endTag][argHead][argTag];
                            if (depScore == float.NegativeInfinity)
                            {
                                continue;
                            }

                            float score = iScoreH[endHead][endTag][split] + argLeftScore + iScoreH[argHead][argTag][split] + depScore + stopLeftScore + headStop[argHead][argTag][split];
                            if (score > bestScore)
                            {
                                bestScore = score;
                            }
                        }
                    }
                }

                iScoreH[endHead][endTag][start_1] = bestScore;
            }

            // right extension
            int startHead = start_1;
            for (int startTag = 0; startTag < numTags; startTag++)
            {
                if (!hasTag[startHead][startTag])
                {
                    continue;
                }

                // bestScore is max for iScoreH
                float bestScore = float.NegativeInfinity;
                for (int argHead = start_1 + 1; argHead < end; argHead++)
                {
                    for (int argTag = 0; argTag < numTags; argTag++)
                    {
                        if (!hasTag[argHead][argTag])
                        {
                            continue;
                        }

                        float argRightScore = iScoreH[argHead][argTag][end];
                        if (argRightScore == float.NegativeInfinity)
                        {
                            continue;
                        }

                        float stopRightScore = headStop[argHead][argTag][end];
                        if (stopRightScore == float.NegativeInfinity)
                        {
                            continue;
                        }

                        for (int split = start_1 + 1; split <= argHead; split++)
                        {
                            // short circuit if dependency is impossible
                            float depScore = headScore[binDistance[startHead][split]][startHead][startTag][argHead][argTag];
                            if (depScore == float.NegativeInfinity)
                            {
                                continue;
                            }

                            float score = iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][split] + argRightScore + depScore + stopRightScore + headStop[argHead][argTag][split];
                            if (score > bestScore)
                            {
                                bestScore = score;
                            }
                        }
                    }
                }

                iScoreH[startHead][startTag][end] = bestScore;
            }
        }
    }

    int goalTag = dg.TagBin(tagIndex.IndexOf(LexiconConstants.BoundaryTag));
    if (op.testOptions.verbose)
    {
        Timing.Tick("done.");
        log.Info("Dep parsing " + length + " words (incl. stop): insideScore " + (iScoreH[length - 1][goalTag][0] + iScoreH[length - 1][goalTag][length]));
    }

    if (!op.doPCFG)
    {
        // Outside scores and filters are only computed when op.doPCFG is set.
        return(HasParse());
    }

    if (op.testOptions.verbose)
    {
        log.Info("Starting outsides...");
    }

    oScoreH[length - 1][goalTag][0] = 0.0f;
    oScoreH[length - 1][goalTag][length] = 0.0f;
    for (int diff_1 = length; diff_1 > 1; diff_1--)
    {
        if (Thread.Interrupted())
        {
            throw new RuntimeInterruptedException();
        }

        for (int start_1 = 0; start_1 + diff_1 <= length; start_1++)
        {
            int end = start_1 + diff_1;

            // left half
            int endHead = end - 1;
            for (int endTag = 0; endTag < numTags; endTag++)
            {
                if (!hasTag[endHead][endTag])
                {
                    continue;
                }

                for (int argHead = start_1; argHead < endHead; argHead++)
                {
                    for (int argTag = 0; argTag < numTags; argTag++)
                    {
                        if (!hasTag[argHead][argTag])
                        {
                            continue;
                        }

                        for (int split = argHead; split <= endHead; split++)
                        {
                            float subScore = (oScoreH[endHead][endTag][start_1] + headScore[binDistance[endHead][split]][endHead][endTag][argHead][argTag] + headStop[argHead][argTag][start_1] + headStop[argHead][argTag][split]);
                            float scoreRight = (subScore + iScoreH[argHead][argTag][start_1] + iScoreH[argHead][argTag][split]);
                            float scoreMid = (subScore + iScoreH[argHead][argTag][start_1] + iScoreH[endHead][endTag][split]);
                            float scoreLeft = (subScore + iScoreH[argHead][argTag][split] + iScoreH[endHead][endTag][split]);
                            if (scoreRight > oScoreH[endHead][endTag][split])
                            {
                                oScoreH[endHead][endTag][split] = scoreRight;
                            }
                            if (scoreMid > oScoreH[argHead][argTag][split])
                            {
                                oScoreH[argHead][argTag][split] = scoreMid;
                            }
                            if (scoreLeft > oScoreH[argHead][argTag][start_1])
                            {
                                oScoreH[argHead][argTag][start_1] = scoreLeft;
                            }
                        }
                    }
                }
            }

            // right half
            int startHead = start_1;
            for (int startTag = 0; startTag < numTags; startTag++)
            {
                if (!hasTag[startHead][startTag])
                {
                    continue;
                }

                for (int argHead = startHead + 1; argHead < end; argHead++)
                {
                    for (int argTag = 0; argTag < numTags; argTag++)
                    {
                        if (!hasTag[argHead][argTag])
                        {
                            continue;
                        }

                        for (int split = startHead + 1; split <= argHead; split++)
                        {
                            float subScore = (oScoreH[startHead][startTag][end] + headScore[binDistance[startHead][split]][startHead][startTag][argHead][argTag] + headStop[argHead][argTag][split] + headStop[argHead][argTag][end]);
                            float scoreLeft = (subScore + iScoreH[argHead][argTag][split] + iScoreH[argHead][argTag][end]);
                            float scoreMid = (subScore + iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][end]);
                            float scoreRight = (subScore + iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][split]);
                            if (scoreLeft > oScoreH[startHead][startTag][split])
                            {
                                oScoreH[startHead][startTag][split] = scoreLeft;
                            }
                            if (scoreMid > oScoreH[argHead][argTag][split])
                            {
                                oScoreH[argHead][argTag][split] = scoreMid;
                            }
                            if (scoreRight > oScoreH[argHead][argTag][end])
                            {
                                oScoreH[argHead][argTag][end] = scoreRight;
                            }
                        }
                    }
                }
            }
        }
    }

    if (op.testOptions.verbose)
    {
        Timing.Tick("done.");
        log.Info("Starting half-filters...");
    }

    for (int loc_1 = 0; loc_1 <= length; loc_1++)
    {
        for (int head_2 = 0; head_2 < length; head_2++)
        {
            Arrays.Fill(iPossibleByL[loc_1][head_2], false);
            Arrays.Fill(iPossibleByR[loc_1][head_2], false);
            Arrays.Fill(oPossibleByL[loc_1][head_2], false);
            Arrays.Fill(oPossibleByR[loc_1][head_2], false);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    for (int head_3 = 0; head_3 < length; head_3++)
    {
        for (int tag = 0; tag < numTags; tag++)
        {
            if (!hasTag[head_3][tag])
            {
                continue;
            }

            for (int start_1 = 0; start_1 <= head_3; start_1++)
            {
                for (int end = head_3 + 1; end <= length; end++)
                {
                    // A (start, end) pair around the head is marked possible only when
                    // both the inside and the outside sums are finite.
                    if (iScoreH[head_3][tag][start_1] + iScoreH[head_3][tag][end] > float.NegativeInfinity && oScoreH[head_3][tag][start_1] + oScoreH[head_3][tag][end] > float.NegativeInfinity)
                    {
                        iPossibleByR[end][head_3][tag] = true;
                        iPossibleByL[start_1][head_3][tag] = true;
                        oPossibleByR[end][head_3][tag] = true;
                        oPossibleByL[start_1][head_3][tag] = true;
                    }
                }
            }
        }
    }

    if (op.testOptions.verbose)
    {
        Timing.Tick("done.");
    }

    return(HasParse());
}