/// <summary>
/// Checks whether <paramref name="unit"/> occurs in the literal string and, if so,
/// parses the string compositionally around it.
/// </summary>
/// <remarks>
/// Chinese numbers compose as (value before unit) * (unit value) + (value after unit),
/// e.g. 三百二十 = 3 * 100 + 20. When 十 or 百 is the first character (e.g. 十二,
/// 百二十), the implicit leading multiplier is 1.
/// </remarks>
/// <param name="s">The literal number string to parse.</param>
/// <param name="unit">The unit character to split on (e.g. 十, 百, 千, 万, 亿).</param>
/// <returns>The parsed value, or null if the unit is invalid/absent or parsing fails.</returns>
private static double? CompositeAtUnitIfExists(string s, string unit)
{
    // Unknown unit: nothing to do.
    if (!quantityUnitToValues.ContainsKey(unit))
    {
        return null;
    }
    int idx = s.IndexOf(unit, System.StringComparison.Ordinal);
    if (idx != -1)
    {
        // Implicit multiplier of 1 when 十/百 is the first char (十二 == 12, 百二十 == 120).
        double? first = 1.0;
        if (!(("十".Equals(unit) || "百".Equals(unit)) && idx == 0))
        {
            // Otherwise parse the value preceding the unit.
            first = RecurNormalizeLiteralIntegerString(s.Substring(0, idx));
        }
        double? second = RecurNormalizeLiteralIntegerString(s.Substring(idx + 1));
        if (first != null && second != null)
        {
            return first.Value * quantityUnitToValues.GetCount(unit) + second.Value;
        }
    }
    // Unit not present, or one of the recursive sub-parses failed.
    return null;
}
/// <summary>
/// Runs the statistical coreference pipeline over one document: scores every
/// unlabeled mention pair with the classification, ranking and anaphoricity
/// models, then applies the clusterer's proposed merges to the document.
/// </summary>
/// <param name="document">The document whose coreference clusters are updated in place.</param>
public virtual void RunCoref(Document document)
{
    IDictionary<Pair<int, int>, bool> mentionPairs = CorefUtils.GetUnlabeledMentionPairs(document);
    // No candidate pairs: nothing to resolve.
    if (mentionPairs.Count == 0)
    {
        return;
    }
    // One compressor instance is shared by all three models' feature lookups.
    Compressor<string> compressor = new Compressor<string>();
    DocumentExamples examples = extractor.Extract(0, document, mentionPairs, compressor);
    ICounter<Pair<int, int>> classificationScores = new ClassicCounter<Pair<int, int>>();
    ICounter<Pair<int, int>> rankingScores = new ClassicCounter<Pair<int, int>>();
    ICounter<int> anaphoricityScores = new ClassicCounter<int>();
    foreach (Example example in examples.examples)
    {
        CorefUtils.CheckForInterrupt();
        Pair<int, int> mentionPair = new Pair<int, int>(example.mentionId1, example.mentionId2);
        classificationScores.IncrementCount(mentionPair, classificationModel.Predict(example, examples.mentionFeatures, compressor));
        rankingScores.IncrementCount(mentionPair, rankingModel.Predict(example, examples.mentionFeatures, compressor));
        // Anaphoricity is per-mention, so compute it only once per anaphor id.
        if (!anaphoricityScores.ContainsKey(example.mentionId2))
        {
            anaphoricityScores.IncrementCount(example.mentionId2, anaphoricityModel.Predict(new Example(example, false), examples.mentionFeatures, compressor));
        }
    }
    // NOTE(review): Collectors.ToMap(null, null) looks like a lost key/value lambda
    // pair from the original Java stream collector — confirm against the Java source.
    ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));
    // Apply each merge chosen by the clusterer to the document's coref clusters.
    foreach (Pair<int, int> mentionPair_1 in clusterer.GetClusterMerges(doc))
    {
        CorefUtils.MergeCoreferenceClusters(mentionPair_1, document);
    }
}
/// <summary>
/// Initializes the search state for greedy cluster merging over one document:
/// one singleton cluster per mention, plus a candidate mention-pair list pruned
/// by score and early-stopping thresholds, with precomputed global features.
/// </summary>
/// <param name="doc">The preprocessed document with pairwise coreference scores.</param>
public State(ClustererDataLoader.ClustererDoc doc)
{
    currentDocId = doc.id;
    this.doc = doc;
    this.hashedScores = new Dictionary<Clusterer.MergeKey, bool>();
    this.hashedCosts = new Dictionary<long, double>();
    this.clusters = new List<Clusterer.Cluster>();
    this.hash = 0;
    mentionToCluster = new Dictionary<int, Clusterer.Cluster>();
    // Start with every mention in its own singleton cluster. The state hash is an
    // XOR over scaled cluster hashes, so later merges can update it incrementally.
    foreach (int m in doc.mentions)
    {
        Clusterer.Cluster c = new Clusterer.Cluster(m);
        clusters.Add(c);
        mentionToCluster[m] = c;
        hash ^= c.hash * 7;
    }
    IList<Pair<int, int>> allPairs = new List<Pair<int, int>>(doc.classificationScores.KeySet());
    ICounter<Pair<int, int>> scores = UseRanking ? doc.rankingScores : doc.classificationScores;
    // NOTE(review): Sort(null) presumably stood in for a score comparator in the
    // original Java (likely descending by score) — confirm against the Java source.
    allPairs.Sort(null);
    int i = 0;
    // Keep a prefix of the sorted pairs: stop once scores drop below
    // MinPairwiseScore (after at least MinPairs) or the early-stop ratio trips.
    for (i = 0; i < allPairs.Count; i++)
    {
        double score = scores.GetCount(allPairs[i]);
        if (score < MinPairwiseScore && i > MinPairs)
        {
            break;
        }
        if (i >= EarlyStopThreshold && i / score > EarlyStopVal)
        {
            break;
        }
    }
    mentionPairs = allPairs.SubList(0, i);
    ICounter<int> seenAnaphors = new ClassicCounter<int>();
    ICounter<int> seenAntecedents = new ClassicCounter<int>();
    // Precompute per-pair global features over the FULL pair list: position index,
    // whether the anaphor appeared in an earlier pair, pruned-list size, and a
    // document-size feature normalized by 300 mentions.
    globalFeatures = new List<Clusterer.GlobalFeatures>();
    for (int j = 0; j < allPairs.Count; j++)
    {
        Pair<int, int> mentionPair = allPairs[j];
        Clusterer.GlobalFeatures gf = new Clusterer.GlobalFeatures();
        gf.currentIndex = j;
        gf.anaphorSeen = seenAnaphors.ContainsKey(mentionPair.second);
        gf.size = mentionPairs.Count;
        gf.docSize = doc.mentions.Count / 300.0;
        globalFeatures.Add(gf);
        seenAnaphors.IncrementCount(mentionPair.second);
        seenAntecedents.IncrementCount(mentionPair.first);
    }
    currentIndex = 0;
    SetClusters();
}
/// <summary>
/// Recursively parses an integer string expressed in Chinese literals or a mix
/// of Chinese and arabic digits (e.g. 三百二十, 3百2十).
/// </summary>
/// <param name="s">The literal integer string.</param>
/// <returns>The numeric value, or null if the string cannot be parsed.</returns>
private static double? RecurNormalizeLiteralIntegerString(string s)
{
    // Empty substrings arise from the recursion (e.g. nothing after a trailing unit).
    if (s.Length == 0)
    {
        return 0.0;
    }
    // TODO: check if it is valid. It is possible that this is a vague number like
    // "五六十" which cannot be parsed by the current implementation.
    // Pure arabic digits: parse them directly, culture-invariantly.
    if (ArabicNumbersPattern.Matcher(s).Matches())
    {
        return double.Parse(s, System.Globalization.CultureInfo.InvariantCulture);
    }
    // A leading 零/〇 before other characters is a filler (e.g. 一百零五): drop it.
    if (s.Length > 1 && (s.StartsWith("零", System.StringComparison.Ordinal) || s.StartsWith("〇", System.StringComparison.Ordinal)))
    {
        s = s.Substring(1);
    }
    // A single remaining literal digit maps straight to its value.
    if (s.Length == 1 && wordsToValues.ContainsKey(s))
    {
        return wordsToValues.GetCount(s);
    }
    // Otherwise exploit compositionality: try splitting on each unit from
    // largest to smallest (same order as the original if/else chain).
    foreach (string unit in new[] { "亿", "万", "千", "百", "十" })
    {
        double? value = CompositeAtUnitIfExists(s, unit);
        if (value != null)
        {
            return value;
        }
    }
    // No unit matched: we fail to parse.
    return null;
}
/// <summary>
/// Collapses the two-dimensional (pattern, extracted-phrase) counts into one
/// aggregate score per pattern, optionally weighting each phrase by its
/// estimated quality (edit-distance, ngram, word-class or classifier measures).
/// </summary>
/// <param name="label">Label of the class whose patterns are being scored.</param>
/// <param name="scoringFunction">Frequency-based weight for a (pattern, phrase) pair.</param>
/// <param name="allCandidatePhrases">Phrases for which quality scores are precomputed.</param>
/// <param name="positivePatternsAndWords">Counts of phrases extracted by each positive pattern.</param>
/// <param name="sqrtPatScore">NOTE(review): unused — the code consults constVars.sqrtPatScore instead; confirm intended.</param>
/// <param name="scorePhrasesInPatSelection">If true, phrase quality modulates the pattern scores.</param>
/// <param name="dictOddsWordWeights">Dictionary-odds weight per phrase (semantic-odds measure).</param>
/// <param name="useFreqPhraseExtractedByPat">If true, multiply in the scoring function's weight.</param>
/// <returns>A counter with one aggregated score per pattern.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual ICounter<E> Convert2OneDim(string label, IToDoubleFunction<Pair<E, CandidatePhrase>> scoringFunction, ICollection<CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter<E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter<CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
{
    // if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
    // Data.loadGoogleNGrams();
    // }
    ICounter<E> patterns = new ClassicCounter<E>();
    // Per-phrase scores under the individual quality measures; filled only for
    // the PhEvalInPat* scoring modes below.
    ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromOtherSemanticBinaryScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter<CandidatePhrase>();
    // Default dist-sim weight for phrases with no known word class.
    double externalWtsDefault = 0.5;
    ICounter<string> classifierScores = null;
    if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
    {
        // Precompute every enabled phrase-quality measure for each candidate phrase.
        foreach (CandidatePhrase gc in allCandidatePhrases)
        {
            string g = gc.GetPhrase();
            if (constVars.usePatternEvalEditDistOther)
            {
                editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
            }
            if (constVars.usePatternEvalEditDistSame)
            {
                editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
            }
            if (constVars.usePatternEvalGoogleNgram)
            {
                googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
            }
            if (constVars.usePatternEvalDomainNgram)
            {
                // calculate domain-ngram wts
                if (Data.domainNGramRawFreq.ContainsKey(g))
                {
                    System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                    domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                }
            }
            if (constVars.usePatternEvalWordClass)
            {
                // NOTE(review): `int num` compared against null below looks like a Java
                // Integer translated to a non-nullable int — likely needs int?.
                int num = constVars.GetWordClassClusters()[g];
                if (num == null)
                {
                    // Fall back to the lower-cased phrase for the cluster lookup.
                    num = constVars.GetWordClassClusters()[g.ToLower()];
                }
                if (num != null && constVars.distSimWeights[label].ContainsKey(num))
                {
                    externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                }
                else
                {
                    externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                }
            }
        }
        // Normalize each enabled measure so they are comparable when averaged later.
        if (constVars.usePatternEvalGoogleNgram)
        {
            googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalDomainNgram)
        {
            domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalWordClass)
        {
            externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
        }
    }
    else
    {
        if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
        {
            // Logistic-regression modes: score all phrases with a learned classifier
            // instead of the hand-built measures above.
            Properties props2 = new Properties();
            props2.PutAll(props);
            props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
            ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
            System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
            ArgumentParser.FillOptions(typeof(Data), props2);
            classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
        }
    }
    // A phrase's quality is independent of the pattern, so cache it per call.
    ICounter<CandidatePhrase> cachedScoresForThisIter = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in positivePatternsAndWords.EntrySet())
    {
        foreach (KeyValuePair<CandidatePhrase, double> en2 in en.Value.EntrySet())
        {
            CandidatePhrase word = en2.Key;
            ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
            double score = 1;
            if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
            {
                if (cachedScoresForThisIter.ContainsKey(word))
                {
                    score = cachedScoresForThisIter.GetCount(word);
                }
                else
                {
                    if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                    {
                        // Phrases known to belong to other classes, or common English
                        // words, keep the default weight of 1.
                        score = 1;
                    }
                    else
                    {
                        // Each measure is recorded as (1 - normalized goodness).
                        if (constVars.usePatternEvalSemanticOdds)
                        {
                            double semanticClassOdds = 1;
                            if (dictOddsWordWeights.ContainsKey(word))
                            {
                                semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                        }
                        if (constVars.usePatternEvalGoogleNgram)
                        {
                            double gscore = 0;
                            if (googleNgramNormScores.ContainsKey(word))
                            {
                                gscore = 1 - googleNgramNormScores.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                        }
                        if (constVars.usePatternEvalDomainNgram)
                        {
                            double domainscore;
                            if (domainNgramNormScores.ContainsKey(word))
                            {
                                domainscore = 1 - domainNgramNormScores.GetCount(word);
                            }
                            else
                            {
                                // Out-of-vocabulary: back off via GetPhraseWeightFromWords
                                // with the OOV domain-ngram default.
                                domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                        }
                        if (constVars.usePatternEvalWordClass)
                        {
                            double externalFeatureWt = externalWtsDefault;
                            if (externalFeatWtsNormalized.ContainsKey(word))
                            {
                                externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                        }
                        if (constVars.usePatternEvalEditDistOther)
                        {
                            System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                        }
                        if (constVars.usePatternEvalEditDistSame)
                        {
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                        }
                        // taking average
                        score = Counters.Mean(scoreslist);
                        phInPatScores.SetCounter(word, scoreslist);
                    }
                    cachedScoresForThisIter.SetCount(word, score);
                }
            }
            else
            {
                if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
                {
                    score = 1 - classifierScores.GetCount(word);
                }
            }
            // score = 1 - scorePhrases.scoreUsingClassifer(classifier,
            // e.getKey(), label, true, null, null, dictOddsWordWeights);
            // throw new RuntimeException("not implemented yet");
            if (useFreqPhraseExtractedByPat)
            {
                score = score * scoringFunction.ApplyAsDouble(new Pair<E, CandidatePhrase>(en.Key, word));
            }
            // Aggregate into the pattern's total, optionally square-rooted.
            if (constVars.sqrtPatScore)
            {
                patterns.IncrementCount(en.Key, Math.Sqrt(score));
            }
            else
            {
                patterns.IncrementCount(en.Key, score);
            }
        }
    }
    return (patterns);
}
/// <summary>
/// Scores each candidate phrase by averaging the enabled phrase-evaluation
/// measures (pattern TF-IDF, semantic odds, domain/Google ngram, dist-sim word
/// class, edit distances, word shape) and returns the per-phrase mean scores.
/// </summary>
/// <param name="label">Label of the class being learned.</param>
/// <param name="terms">Candidate phrases keyed to the patterns that extracted them.</param>
/// <param name="wordsPatExtracted">NOTE(review): unused in this method body.</param>
/// <param name="allSelectedPatterns">Scores of patterns selected so far (TF-IDF weighting).</param>
/// <param name="alreadyIdentifiedWords">Phrases learned in earlier iterations; skipped.</param>
/// <param name="forLearningPatterns">NOTE(review): unused in this method body.</param>
/// <returns>Counter of mean scores per phrase; infinite/NaN means are dropped.</returns>
internal override ICounter<CandidatePhrase> ScorePhrases(string label, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, bool forLearningPatterns)
{
    IDictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> scores = new Dictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>>();
    if (Data.domainNGramsFile != null)
    {
        Data.LoadDomainNGrams();
    }
    Redwood.Log(ConstantsAndVariables.extremedebug, "Considering terms: " + terms.FirstKeySet());
    // calculate TF-IDF like scores
    ICounter<CandidatePhrase> tfidfScores = new ClassicCounter<CandidatePhrase>();
    if (constVars.usePhraseEvalPatWtByFreq)
    {
        foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
        {
            double score = GetPatTFIDFScore(en.Key, en.Value, allSelectedPatterns);
            tfidfScores.SetCount(en.Key, score);
        }
        Redwood.Log(ConstantsAndVariables.extremedebug, "BEFORE IDF " + Counters.ToSortedString(tfidfScores, 100, "%1$s:%2$f", "\t"));
        // IDF step: divide the TF-style scores by overall corpus frequency.
        Counters.DivideInPlace(tfidfScores, Data.processedDataFreq);
    }
    ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceOtherBinaryScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceSameBinaryScores = new ClassicCounter<CandidatePhrase>();
    // Precompute each enabled raw measure for every candidate phrase.
    foreach (CandidatePhrase gc in terms.FirstKeySet())
    {
        string g = gc.GetPhrase();
        if (constVars.usePhraseEvalEditDistOther)
        {
            editDistanceOtherBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
        }
        if (constVars.usePhraseEvalEditDistSame)
        {
            editDistanceSameBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresThisClassThreshold(label, g));
        }
        if (constVars.usePhraseEvalDomainNgram)
        {
            // calculate domain-ngram wts
            if (Data.domainNGramRawFreq.ContainsKey(g))
            {
                System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                domainNgramNormScores.SetCount(gc, GetDomainNgramScore(g));
            }
            else
            {
                log.Info("why is " + g + " not present in domainNgram");
            }
        }
        if (constVars.usePhraseEvalGoogleNgram)
        {
            googleNgramNormScores.SetCount(gc, GetGoogleNgramScore(gc));
        }
        if (constVars.usePhraseEvalWordClass)
        {
            // calculate dist sim weights
            // NOTE(review): `int num` compared against null — Java Integer artifact;
            // likely needs int? in a faithful C# port.
            int num = constVars.GetWordClassClusters()[g];
            if (num == null)
            {
                num = constVars.GetWordClassClusters()[g.ToLower()];
            }
            if (num != null && constVars.distSimWeights[label].ContainsKey(num))
            {
                externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
            }
            else
            {
                externalFeatWtsNormalized.SetCount(gc, OOVExternalFeatWt);
            }
        }
    }
    // Normalize every measure into a comparable range before averaging.
    ICounter<CandidatePhrase> normTFIDFScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(tfidfScores, true, true, false);
    ICounter<CandidatePhrase> dictOdddsScores = null;
    if (constVars.usePhraseEvalSemanticOdds)
    {
        System.Diagnostics.Debug.Assert(constVars.dictOddsWeights != null, "usePhraseEvalSemanticOdds is true but dictOddsWeights is null for the label " + label);
        dictOdddsScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
    }
    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
    // Counters.max(googleNgramNormScores);
    // Counters.max(externalFeatWtsNormalized);
    // Assemble the per-phrase score vector from the normalized measures.
    foreach (CandidatePhrase word in terms.FirstKeySet())
    {
        // Skip phrases already learned in earlier iterations.
        if (alreadyIdentifiedWords.Contains(word))
        {
            continue;
        }
        ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
        System.Diagnostics.Debug.Assert(normTFIDFScores.ContainsKey(word), "NormTFIDF score does not contain" + word);
        double tfscore = normTFIDFScores.GetCount(word);
        scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Patwtbyfreq, tfscore);
        if (constVars.usePhraseEvalSemanticOdds)
        {
            double dscore;
            if (dictOdddsScores.ContainsKey(word))
            {
                dscore = dictOdddsScores.GetCount(word);
            }
            else
            {
                // Out-of-vocabulary: back off via GetPhraseWeightFromWords with the OOV default.
                dscore = GetPhraseWeightFromWords(dictOdddsScores, word, OOVdictOdds);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, dscore);
        }
        if (constVars.usePhraseEvalDomainNgram)
        {
            double domainscore;
            if (domainNgramNormScores.ContainsKey(word))
            {
                domainscore = domainNgramNormScores.GetCount(word);
            }
            else
            {
                domainscore = GetPhraseWeightFromWords(domainNgramNormScores, word, OOVDomainNgramScore);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
        }
        if (constVars.usePhraseEvalGoogleNgram)
        {
            double googlescore;
            if (googleNgramNormScores.ContainsKey(word))
            {
                googlescore = googleNgramNormScores.GetCount(word);
            }
            else
            {
                googlescore = GetPhraseWeightFromWords(googleNgramNormScores, word, OOVGoogleNgramScore);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, googlescore);
        }
        if (constVars.usePhraseEvalWordClass)
        {
            double externalFeatureWt;
            if (externalFeatWtsNormalized.ContainsKey(word))
            {
                externalFeatureWt = externalFeatWtsNormalized.GetCount(word);
            }
            else
            {
                externalFeatureWt = GetPhraseWeightFromWords(externalFeatWtsNormalized, word, OOVExternalFeatWt);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
        }
        if (constVars.usePhraseEvalEditDistOther)
        {
            System.Diagnostics.Debug.Assert(editDistanceOtherBinaryScores.ContainsKey(word), "How come no edit distance info?");
            double editD = editDistanceOtherBinaryScores.GetCount(word);
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editD);
        }
        if (constVars.usePhraseEvalEditDistSame)
        {
            double editDSame = editDistanceSameBinaryScores.GetCount(word);
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDSame);
        }
        if (constVars.usePhraseEvalWordShape)
        {
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Wordshape, this.GetWordShapeScore(word.GetPhrase(), label));
        }
        scores[word] = scoreslist;
        phraseScoresNormalized.SetCounter(word, scoreslist);
    }
    // Final score is the mean over the collected measures; drop degenerate values.
    ICounter<CandidatePhrase> phraseScores = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> wEn in scores)
    {
        double avgScore = Counters.Mean(wEn.Value);
        // NOTE(review): avgScore.IsInfinite() is a Java Double artifact — standard C#
        // would be double.IsInfinity(avgScore); confirm against the conversion runtime.
        if (!avgScore.IsInfinite() && !double.IsNaN(avgScore))
        {
            phraseScores.SetCount(wEn.Key, avgScore);
        }
        else
        {
            Redwood.Log(Redwood.Dbg, "Ignoring " + wEn.Key + " because score is " + avgScore);
        }
    }
    return (phraseScores);
}
/// <summary>
/// Scores a word/tag pair for unknown-word handling in the Chinese lexicon:
/// date-, number- and proper-name-like words are forced to their dedicated tags
/// (NT, CD/OD, NR); anything else is smoothed from first-character statistics.
/// </summary>
/// <param name="itw">The tag (looked up through tagIndex) being scored.</param>
/// <param name="word">The word form.</param>
/// <returns>A log probability; negative infinity when the tag is incompatible.</returns>
public override float Score(IntTaggedWord itw, string word)
{
    // Label tagL = itw.tagLabel();
    // String tag = tagL.value();
    string tag = itw.TagString(tagIndex);
    ILabel tagL = new Tag(tag);
    float logProb;
    if (word.Matches(dateMatch))
    {
        //EncodingPrintWriter.out.println("Date match for " + word,encoding);
        // Date-like words can only be NT.
        if (tag.Equals("NT"))
        {
            logProb = 0.0f;
        }
        else
        {
            logProb = float.NegativeInfinity;
        }
    }
    else
    {
        if (word.Matches(numberMatch))
        {
            //EncodingPrintWriter.out.println("Number match for " + word,encoding);
            // Cardinals get CD (but not if the word looks ordinal); ordinals get OD.
            if (tag.Equals("CD") && (!word.Matches(ordinalMatch)))
            {
                logProb = 0.0f;
            }
            else
            {
                if (tag.Equals("OD") && word.Matches(ordinalMatch))
                {
                    logProb = 0.0f;
                }
                else
                {
                    logProb = float.NegativeInfinity;
                }
            }
        }
        else
        {
            if (word.Matches(properNameMatch))
            {
                //EncodingPrintWriter.out.println("Proper name match for " + word,encoding);
                // Proper-name-like words can only be NR.
                if (tag.Equals("NR"))
                {
                    logProb = 0.0f;
                }
                else
                {
                    logProb = float.NegativeInfinity;
                }
            }
            else
            {
                /* ------------- * // this didn't seem to work -- too categorical
                 * int type = Character.getType(word.charAt(0));
                 * // the below may not normalize probs over options, but is probably okay
                 * if (type == Character.START_PUNCTUATION) {
                 * if (tag.equals("PU-LPAREN") || tag.equals("PU-PAREN") ||
                 * tag.equals("PU-LQUOTE") || tag.equals("PU-QUOTE") ||
                 * tag.equals("PU")) {
                 * // if (VERBOSE) log.info("ChineseUWM: unknown L Punc");
                 * logProb = 0.0f;
                 * } else {
                 * logProb = Float.NEGATIVE_INFINITY;
                 * }
                 * } else if (type == Character.END_PUNCTUATION) {
                 * if (tag.equals("PU-RPAREN") || tag.equals("PU-PAREN") ||
                 * tag.equals("PU-RQUOTE") || tag.equals("PU-QUOTE") ||
                 * tag.equals("PU")) {
                 * // if (VERBOSE) log.info("ChineseUWM: unknown R Punc");
                 * logProb = 0.0f;
                 * } else {
                 * logProb = Float.NEGATIVE_INFINITY;
                 * }
                 * } else {
                 * if (tag.equals("PU-OTHER") || tag.equals("PU-ENDSENT") ||
                 * tag.equals("PU")) {
                 * // if (VERBOSE) log.info("ChineseUWM: unknown O Punc");
                 * logProb = 0.0f;
                 * } else {
                 * logProb = Float.NEGATIVE_INFINITY;
                 * }
                 * }
                 * ------------- */
                if (useFirst)
                {
                    // Back off to the first character of the unknown word.
                    string first = Sharpen.Runtime.Substring(word, 0, 1);
                    if (useUnicodeType)
                    {
                        char ch = word[0];
                        // NOTE(review): char.GetType(ch) / char.OtherLetter / int.ToString(type)
                        // look like unconverted Java Character.getType idioms — confirm the
                        // conversion runtime supplies these (C# would use CharUnicodeInfo).
                        int type = char.GetType(ch);
                        if (type != char.OtherLetter)
                        {
                            // standard Chinese characters are of type "OTHER_LETTER"!!
                            first = int.ToString(type);
                        }
                    }
                    // Unseen first character: use Good-Turing smoothing if enabled,
                    // otherwise substitute the generic unknown symbol.
                    if (!seenFirst.Contains(first))
                    {
                        if (useGT)
                        {
                            logProb = ScoreGT(tag);
                            goto first_break;
                        }
                        else
                        {
                            first = unknown;
                        }
                    }
                    /* get the Counter of terminal rewrites for the relevant tag */
                    ClassicCounter<string> wordProbs = tagHash[tagL];
                    /* if the proposed tag has never been seen before, issue a
                     * warning and return probability 0. */
                    if (wordProbs == null)
                    {
                        logProb = float.NegativeInfinity;
                    }
                    else
                    {
                        if (wordProbs.ContainsKey(first))
                        {
                            logProb = (float)wordProbs.GetCount(first);
                        }
                        else
                        {
                            logProb = (float)wordProbs.GetCount(unknown);
                        }
                    }
                }
                else
                {
                    if (useGT)
                    {
                        logProb = ScoreGT(tag);
                    }
                    else
                    {
                        logProb = float.NegativeInfinity;
                    }
                }
                first_break :;
            }
        }
    }
    // should never get this!
    return (logProb);
}
/// <summary>
/// Ad hoc smoke test: exercises the unknown-word regexes (proper names,
/// numbers, dates) on sample strings, then checks equality and counter-key
/// behavior of TaggedWord versus WordTag.
/// </summary>
/// <param name="args">Unused.</param>
public static void Main(string[] args)
{
    System.Console.Out.WriteLine("Testing unknown matching");
    // Prints "hooray <kind>!" when the sample matches, "Uh-oh <kind>!" otherwise.
    void Report(bool matched, string kind)
    {
        System.Console.Out.WriteLine(matched ? "hooray " + kind + "!" : "Uh-oh " + kind + "!");
    }
    Report("\u5218\u00b7\u9769\u547d".Matches(properNameMatch), "names");
    Report("\uff13\uff10\uff10\uff10".Matches(numberMatch), "numbers");
    Report("\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c".Matches(numberMatch), "numbers");
    Report("\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d".Matches(numberMatch), "numbers");
    Report("\u4e09\u6708".Matches(dateMatch), "dates");
    System.Console.Out.WriteLine("Testing tagged word");
    // Two TaggedWords sharing a word but differing in tag: probe counter-key
    // containment and equality.
    ClassicCounter<TaggedWord> c = new ClassicCounter<TaggedWord>();
    TaggedWord tw1 = new TaggedWord("w", "t");
    c.IncrementCount(tw1);
    TaggedWord tw2 = new TaggedWord("w", "t2");
    System.Console.Out.WriteLine(c.ContainsKey(tw2));
    System.Console.Out.WriteLine(tw1.Equals(tw2));
    // The same comparisons after converting to WordTag.
    WordTag wt1 = ToWordTag(tw1);
    WordTag wt2 = ToWordTag(tw2);
    WordTag wt3 = new WordTag("w", "t2");
    System.Console.Out.WriteLine(wt1.Equals(wt2));
    System.Console.Out.WriteLine(wt2.Equals(wt3));
}