/// <summary>
/// Calculate sister annotation statistics suitable for doing
/// selective sister splitting in the PCFGParser inside the
/// FactoredParser.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the path to the Treebank (required);
/// <c>args[1]</c> is an optional character encoding, defaulting to UTF-8.
/// </param>
public static void Main(string[] args)
{
    // Smoke check kept from the original: prints the KL divergence of a counter against itself.
    ClassicCounter<string> c = new ClassicCounter<string>();
    c.SetCount("A", 0);
    c.SetCount("B", 1);
    double d = Counters.KlDivergence(c, c);
    System.Console.Out.WriteLine("KL Divergence: " + d);

    if (args.Length < 1)
    {
        // The usage text now names this tool and mentions the optional encoding argument.
        System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath [encoding]");
        return;
    }

    string encoding = args.Length > 1 ? args[1] : "UTF-8";
    SisterAnnotationStats pas = new SisterAnnotationStats();
    Treebank treebank = new DiskTreebank(null, encoding);
    treebank.LoadPath(args[0]);
    treebank.Apply(pas);
    pas.PrintStats();
}
/// <summary>
/// Builds a counter with counts 1..4, takes logs in place, and checks that
/// <c>Distribution.GetDistributionFromLogValues</c> yields the four keys with
/// values 0.1, 0.2, 0.3 and 0.4.
/// </summary>
public virtual void TestGetDistributionFromLogValues()
{
    ICounter<string> c1 = new ClassicCounter<string>();
    c1.SetCount("p", 1.0);
    c1.SetCount("q", 2.0);
    c1.SetCount("r", 3.0);
    c1.SetCount("s", 4.0);

    // take log
    Counters.LogInPlace(c1);

    // now call distribution
    Distribution<string> distribution = Distribution.GetDistributionFromLogValues(c1);

    // NUnit's convention is (expected, actual); the original passed them reversed,
    // which makes failure messages report the wrong value as "expected".

    // size
    NUnit.Framework.Assert.AreEqual(4, distribution.KeySet().Count);

    // keys
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("p"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("q"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("r"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("s"));

    // values
    NUnit.Framework.Assert.AreEqual(1.0E-1, distribution.GetCount("p"), 1E-10);
    NUnit.Framework.Assert.AreEqual(2.0E-1, distribution.GetCount("q"), 1E-10);
    NUnit.Framework.Assert.AreEqual(3.0E-1, distribution.GetCount("r"), 1E-10);
    NUnit.Framework.Assert.AreEqual(4.0E-1, distribution.GetCount("s"), 1E-10);
}
/// <summary>Method to convert features from counts to L1-normalized TFIDF based features</summary>
/// <remarks>
/// Features absent from <paramref name="featureDocCounts"/> are dropped. The idf is
/// <c>log((Size() + 1) / (docCount + 0.5))</c>. The normaliser is the plain sum of the
/// tf-idf values (not the sum of absolute values). If that sum is zero the values are
/// left unnormalised rather than being turned into NaN/Infinity by a division by zero.
/// </remarks>
/// <param name="datum">with a collection of features.</param>
/// <param name="featureDocCounts">a counter of doc-count for each feature.</param>
/// <returns>RVFDatum with l1-normalized tf-idf features.</returns>
public virtual RVFDatum<L, F> GetL1NormalizedTFIDFDatum(IDatum<L, F> datum, ICounter<F> featureDocCounts)
{
    // Term frequency: one increment per occurrence of a known feature.
    ICounter<F> tfidfFeatures = new ClassicCounter<F>();
    foreach (F feature in datum.AsFeatures())
    {
        if (featureDocCounts.ContainsKey(feature))
        {
            tfidfFeatures.IncrementCount(feature, 1.0);
        }
    }

    // Replace each tf by tf * idf and accumulate the normaliser.
    double l1norm = 0;
    foreach (F feature_1 in tfidfFeatures.KeySet())
    {
        double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature_1) + 0.5));
        double tf = tfidfFeatures.GetCount(feature_1);
        tfidfFeatures.SetCount(feature_1, tf * idf);
        l1norm += tf * idf;
    }

    // Guard: dividing by a zero normaliser would poison every feature value.
    if (l1norm != 0.0)
    {
        foreach (F feature_2 in tfidfFeatures.KeySet())
        {
            double tfidf = tfidfFeatures.GetCount(feature_2);
            tfidfFeatures.SetCount(feature_2, tfidf / l1norm);
        }
    }

    RVFDatum<L, F> rvfDatum = new RVFDatum<L, F>(tfidfFeatures, datum.Label());
    return rvfDatum;
}
/// <summary>
/// Scores a real-valued datum for both classes: <c>classes[0]</c> receives the
/// negated score and <c>classes[1]</c> the score itself.
/// </summary>
/// <param name="example">The real-valued datum whose feature counter is scored.</param>
/// <returns>A counter holding one entry per class.</returns>
private ICounter<L> ScoresOfRVFDatum(RVFDatum<L, F> example)
{
    ICounter<F> featureValues = example.AsFeaturesCounter();
    double margin = ScoreOf(featureValues);

    ICounter<L> classScores = new ClassicCounter<L>();
    classScores.SetCount(classes[0], -margin);
    classScores.SetCount(classes[1], margin);
    return classScores;
}
/// <summary>returns the scores for both the classes</summary>
/// <remarks>
/// A <c>RVFDatum&lt;L, F&gt;</c> is routed to <c>ScoresOfRVFDatum</c>; any other datum is
/// scored from its feature collection. <c>classes[0]</c> receives the negated score and
/// <c>classes[1]</c> the score.
/// </remarks>
/// <param name="datum">The datum to score.</param>
/// <returns>A counter holding one entry per class.</returns>
public virtual ICounter<L> ScoresOf(IDatum<L, F> datum)
{
    // The original test "datum is RVFDatum<object, object>" is a Java-wildcard conversion artifact:
    // generic classes are invariant in C#, so it never matched a RVFDatum<L, F> and the
    // real-valued path was unreachable.
    RVFDatum<L, F> rvfDatum = datum as RVFDatum<L, F>;
    if (rvfDatum != null)
    {
        return ScoresOfRVFDatum(rvfDatum);
    }

    ICollection<F> features = datum.AsFeatures();
    double sum = ScoreOf(features);
    ICounter<L> c = new ClassicCounter<L>();
    c.SetCount(classes[0], -sum);
    c.SetCount(classes[1], sum);
    return c;
}
/// <summary>
/// Builds an entity mention covering tokens <paramref name="start"/>..<paramref name="end"/>
/// of <paramref name="sentence"/>, and assigns probability 1.0 to the mention's type.
/// </summary>
/// <param name="sentence">The sentence containing the mention.</param>
/// <param name="start">Start offset passed to the <c>Span</c>.</param>
/// <param name="end">End offset passed to the <c>Span</c>.</param>
/// <param name="label">
/// The entity label. A leading "B-" or "I-" prefix is stripped to obtain the type.
/// </param>
/// <param name="identifier">Identifier passed through to the mention factory.</param>
/// <returns>The constructed mention with its type probabilities set.</returns>
public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
{
    Span span = new Span(start, end);

    // Ordinal comparison: the prefix is a fixed ASCII marker, not linguistic text, so the
    // culture-sensitive default of string.StartsWith(string) is the wrong tool here.
    bool hasBioPrefix = label.StartsWith("B-", System.StringComparison.Ordinal)
        || label.StartsWith("I-", System.StringComparison.Ordinal);
    string type = hasBioPrefix ? Sharpen.Runtime.Substring(label, 2) : label;

    // TODO: add support for subtypes! (needed at least in ACE)
    string subtype = null;

    EntityMention entity = entityMentionFactory.ConstructEntityMention(identifier, sentence, span, span, type, subtype, null);
    ICounter<string> probs = new ClassicCounter<string>();
    probs.SetCount(entity.GetType(), 1.0);
    entity.SetTypeProbabilities(probs);
    return entity;
}
/// <summary>
/// Scores each pattern by combining two ratios computed against <c>p0Set</c>:
/// overlap / (phrases extracted by the pattern) and overlap / (size of <c>p0Set</c>).
/// The returned value is 2 * product / sum of the two ratios.
/// </summary>
/// <returns>A counter from pattern to its combined score.</returns>
/// <exception cref="Exception">
/// If <c>p0Set</c> has no keys, or a pattern with a non-zero overlap has an empty counter.
/// </exception>
public override ICounter<E> Score()
{
    ICounter<E> specificity = new ClassicCounter<E>();
    ICounter<E> sensitivity = new ClassicCounter<E>();
    if (p0Set.KeySet().Count == 0)
    {
        throw new Exception("how come p0set size is empty for " + p0 + "?");
    }

    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> patternEntry in patternsandWords4Label.EntrySet())
    {
        E pattern = patternEntry.Key;
        ClassicCounter<CandidatePhrase> extracted = patternEntry.Value;

        int overlap = CollectionUtils.Intersection(extracted.KeySet(), p0Set.KeySet()).Count;
        if (overlap == 0)
        {
            continue;
        }
        if (extracted.KeySet().Count == 0)
        {
            throw new Exception("how come counter for " + pattern + " is empty?");
        }

        double perPattern = overlap / (double)extracted.KeySet().Count;
        double perSeedSet = overlap / (double)p0Set.Size();
        specificity.SetCount(pattern, perPattern);
        sensitivity.SetCount(pattern, perSeedSet);
    }

    Counters.RetainNonZeros(specificity);
    Counters.RetainNonZeros(sensitivity);

    ICounter<E> sum = Counters.Add(sensitivity, specificity);
    ICounter<E> product = Counters.Product(sensitivity, specificity);
    Counters.RetainNonZeros(product);
    Counters.RetainKeys(product, sum.KeySet());

    ICounter<E> ratio = Counters.Division(product, sum);
    return Counters.Scale(ratio, 2);
}
/// <summary>
/// Finalises training: for every tag in <c>c</c>, stores in <c>tagHash</c> the value
/// log(count(signature, tag) / count(tag)) for each signature seen with that tag.
/// </summary>
/// <remarks>
/// The UNKNOWN signature is assumed to be seen once in each tag: the tag total is
/// incremented and the UNKNOWN count is set to 1.0 before the log-probabilities are
/// computed. This is sort of broken, but you can regard it as a Dirichlet prior.
/// </remarks>
/// <returns>The trained <c>model</c>.</returns>
public override IUnknownWordModel FinishTraining()
{
    if (useGT)
    {
        unknownGTTrainer.FinishTraining();
    }

    // Outer iteration is over tags.
    foreach (KeyValuePair<ILabel, ClassicCounter<string>> tagEntry in c)
    {
        ILabel tag = tagEntry.Key;
        ClassicCounter<string> signatureCounts = tagEntry.Value; // counts for words given a tag

        if (!tagHash.Contains(tag))
        {
            tagHash[tag] = new ClassicCounter<string>();
        }

        tc.IncrementCount(tag);
        signatureCounts.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);

        // The tag total does not change inside the inner loop.
        double tagTotal = tc.GetCount(tag);

        // Inner iteration is over words.
        foreach (string ending in signatureCounts.KeySet())
        {
            double logProb = Math.Log(signatureCounts.GetCount(ending) / tagTotal); // p(sig|tag)
            tagHash[tag].SetCount(ending, logProb);
        }
    }
    return model;
}
/// <summary>
/// Finalises training: for every tag label in <c>c</c>, stores in <c>tagHash</c> the value
/// log(count(first, tag) / count(tag)) for each key seen with that tag.
/// </summary>
/// <remarks>
/// The UNKNOWN first character is assumed to be seen once in each tag: the tag total is
/// incremented and the UNKNOWN count is set to 1.0 before the log-probabilities are computed.
/// </remarks>
/// <returns>The trained <c>model</c>.</returns>
public override IUnknownWordModel FinishTraining()
{
    if (useGT)
    {
        unknownGTTrainer.FinishTraining();
    }

    // Outer iteration is over tags as Labels.
    foreach (ILabel tag in c.Keys)
    {
        // Counts for words given a tag.
        ClassicCounter<string> firstCharCounts = c[tag];

        bool alreadyKnown = tagHash.Contains(tag);
        if (!alreadyKnown)
        {
            tagHash[tag] = new ClassicCounter<string>();
        }

        tc.IncrementCount(tag);
        firstCharCounts.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);

        // Inner iteration is over words as strings.
        foreach (string firstChar in firstCharCounts.KeySet())
        {
            double observed = firstCharCounts.GetCount(firstChar);
            double total = tc.GetCount(tag);
            tagHash[tag].SetCount(firstChar, Math.Log(observed / total));
        }
    }
    return model;
}
/// <summary>
/// Reads a tab-separated signature file into <paramref name="sigs"/>. On each line the first
/// field is the map key and the remaining fields are read as (name, value) pairs.
/// </summary>
/// <param name="file">Passed to <c>IOUtils.ReaderFromString</c> to open the reader.</param>
/// <param name="sigs">Destination map; an entry is overwritten if its key reappears.</param>
/// <remarks>
/// NOTE(review): a line with a name but no following value indexes past the end of the
/// split array. Behaviour is left unchanged here; confirm whether such lines can occur.
/// </remarks>
private static void LoadSignatures(string file, IDictionary<string, ICounter<string>> sigs)
{
    BufferedReader reader = null;
    try
    {
        reader = IOUtils.ReaderFromString(file);
        while (reader.Ready())
        {
            string[] fields = reader.ReadLine().Split("\t");
            ICounter<string> counts = new ClassicCounter<string>();
            sigs[fields[0]] = counts;

            // Fields 1.. come in pairs: name at an odd index, value right after it.
            int pos = 1;
            while (pos < fields.Length)
            {
                counts.SetCount(fields[pos], double.ParseDouble(fields[pos + 1]));
                pos = pos + 2;
            }
        }
    }
    catch (IOException e)
    {
        throw new Exception(e);
    }
    finally
    {
        IOUtils.CloseIgnoringExceptions(reader);
    }
}
/// <summary>
/// Sparse AdaGrad minimisation over a counter of weights.
/// Does L1 or L2 using FOBOS and lazy update, so L1 should not be handled in the objective.
/// </summary>
/// <remarks>
/// Alternatively, you can handle other regularization in the objective, but then, if the
/// derivative is not sparse, this routine would not be very efficient. However, it might
/// still be okay for CRFs.
/// NOTE(review): <c>timeStep</c> is advanced once per updated feature, and the
/// max-iterations <c>break</c> leaves only the per-feature loop, not the batch or pass loops.
/// That behaviour is kept as-is; confirm whether it is intended.
/// </remarks>
/// <param name="function">Objective providing <c>DataSize</c>, <c>DerivativeAt</c> and <c>ValueAt</c>.</param>
/// <param name="x">Initial weights; updated in place and returned.</param>
/// <param name="maxIterations">Threshold on <c>timeStep</c> that triggers the stop message.</param>
/// <returns>The same counter <paramref name="x"/> after the updates.</returns>
public virtual ICounter<K> Minimize(F function, ICounter<K> x, int maxIterations)
{
    Sayln(" Batch size of: " + batchSize);
    Sayln(" Data dimension of: " + function.DataSize());
    int numBatches = (function.DataSize() - 1) / this.batchSize + 1;
    Sayln(" Batches per pass through data: " + numBatches);
    Sayln(" Number of passes is = " + numPasses);
    Sayln(" Max iterations is = " + maxIterations);

    ICounter<K> lastUpdated = new ClassicCounter<K>();
    int timeStep = 0;
    Timing total = new Timing();
    total.Start();

    for (int iter = 0; iter < numPasses; iter++)
    {
        double totalObjValue = 0;
        for (int j = 0; j < numBatches; j++)
        {
            int[] selectedData = GetSample(function, this.batchSize);

            // the core adagrad
            ICounter<K> gradient = function.DerivativeAt(x, selectedData);
            totalObjValue = totalObjValue + function.ValueAt(x, selectedData);

            foreach (K feature in gradient.KeySet())
            {
                double gradf = gradient.GetCount(feature);
                // Learning rate before and after folding this gradient into the running sum of squares.
                double prevrate = eta / (Math.Sqrt(sumGradSquare.GetCount(feature)) + soften);
                double sgsValue = sumGradSquare.IncrementCount(feature, gradf * gradf);
                double currentrate = eta / (Math.Sqrt(sgsValue) + soften);
                double testupdate = x.GetCount(feature) - (currentrate * gradient.GetCount(feature));
                double lastUpdateTimeStep = lastUpdated.GetCount(feature);
                double idleinterval = timeStep - lastUpdateTimeStep - 1;
                lastUpdated.SetCount(feature, (double)timeStep);

                // does lazy update using idleinterval
                double trunc = Math.Max(0.0, (Math.Abs(testupdate) - (currentrate + prevrate * idleinterval) * this.lambdaL1));
                double trunc2 = trunc * Math.Pow(1 - this.lambdaL2, currentrate + prevrate * idleinterval);
                double realupdate = Math.Signum(testupdate) * trunc2;

                // Drop only weights whose magnitude is negligible. The original compared the signed
                // value ("realupdate < Eps"), which removed every negative weight.
                if (Math.Abs(realupdate) < Eps)
                {
                    x.Remove(feature);
                }
                else
                {
                    x.SetCount(feature, realupdate);
                }

                // reporting
                timeStep++;
                if (timeStep > maxIterations)
                {
                    Sayln("Stochastic Optimization complete. Stopped after max iterations");
                    break;
                }
                Sayln(System.Console.Out.Format("Iter %d \t batch: %d \t time=%.2f \t obj=%.4f", iter, timeStep, total.Report() / 1000.0, totalObjValue).ToString());
            }
        }
    }
    return x;
}
/// <summary>
/// Quick little sanity check: trains a KNN classifier on five humidity/temperature
/// examples labelled "rain" or "dry", then prints the scores and predicted class for one
/// unlabelled test vector.
/// </summary>
/// <param name="args">Unused.</param>
public static void Main(string[] args)
{
    ICollection<RVFDatum<string, string>> trainingInstances = new List<RVFDatum<string, string>>();
    trainingInstances.Add(new RVFDatum<string, string>(WeatherFeatures(5.0, 35.0), "rain"));
    trainingInstances.Add(new RVFDatum<string, string>(WeatherFeatures(4.0, 32.0), "rain"));
    trainingInstances.Add(new RVFDatum<string, string>(WeatherFeatures(6.0, 30.0), "rain"));
    trainingInstances.Add(new RVFDatum<string, string>(WeatherFeatures(2.0, 33.0), "dry"));
    trainingInstances.Add(new RVFDatum<string, string>(WeatherFeatures(1.0, 34.0), "dry"));

    Edu.Stanford.Nlp.Classify.KNNClassifier<string, string> classifier =
        new KNNClassifierFactory<string, string>(3, false, true).Train(trainingInstances);

    RVFDatum<string, string> testVec = new RVFDatum<string, string>(WeatherFeatures(2.0, 33.0));
    System.Console.Out.WriteLine(classifier.ScoresOf(testVec));
    System.Console.Out.WriteLine(classifier.ClassOf(testVec));
}

/// <summary>Builds the two-feature counter ("humidity", "temperature") used by the sanity check.</summary>
private static ClassicCounter<string> WeatherFeatures(double humidity, double temperature)
{
    ClassicCounter<string> features = new ClassicCounter<string>();
    features.SetCount("humidity", humidity);
    features.SetCount("temperature", temperature);
    return features;
}
/// <summary>
/// Walks the candidate phrases of <paramref name="newdt"/> in priority-queue order and
/// selects up to <c>constVars.numWordsToAdd</c> of them, then logs the next ten candidates.
/// </summary>
/// <remarks>
/// Selection stops at the first phrase scoring below <paramref name="thresholdWordExtract"/>.
/// A phrase is skipped when it is listed in <paramref name="useThresholdNumPatternsForTheseWords"/>
/// and has too few non-redundant patterns. A phrase that fuzzily matches an entry of
/// <paramref name="ignoreWords"/> is added to <paramref name="ignoreWords"/> instead of being selected.
/// </remarks>
/// <param name="newdt">Candidate phrases with their scores.</param>
/// <param name="terms">Phrase-by-pattern counts used for the pattern-count check and logging.</param>
/// <param name="useThresholdNumPatternsForTheseWords">Phrases subject to the pattern-count threshold.</param>
/// <param name="ignoreWords">Phrases to ignore; may be null, and may be extended by this call.</param>
/// <param name="thresholdWordExtract">Minimum score a phrase needs to be selected.</param>
/// <returns>The selected phrases with their scores from <paramref name="newdt"/>.</returns>
public virtual ICounter<CandidatePhrase> ChooseTopWords(ICounter<CandidatePhrase> newdt, TwoDimensionalCounter<CandidatePhrase, E> terms, ICounter<CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection<CandidatePhrase> ignoreWords, double thresholdWordExtract)
{
    IEnumerator<CandidatePhrase> termIter = Counters.ToPriorityQueue(newdt).GetEnumerator();
    ICounter<CandidatePhrase> finalwords = new ClassicCounter<CandidatePhrase>();

    // MoveNext() advances the enumerator, unlike Java's hasNext(). When the cap is hit the
    // enumerator already sits on a phrase that has not been looked at; remember that so the
    // "next ten" report below starts with it instead of silently skipping it.
    bool currentNotYetReported = false;

    while (termIter.MoveNext())
    {
        if (finalwords.Size() >= constVars.numWordsToAdd)
        {
            currentNotYetReported = true;
            break;
        }
        CandidatePhrase w = termIter.Current;
        if (newdt.GetCount(w) < thresholdWordExtract)
        {
            Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of " + thresholdWordExtract);
            break;
        }
        System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
        if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
        {
            Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
            continue;
        }
        CandidatePhrase matchedFuzzy = null;
        if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
        {
            matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
        }
        if (matchedFuzzy == null)
        {
            Redwood.Log("extremePatDebug", "adding word " + w);
            finalwords.SetCount(w, newdt.GetCount(w));
        }
        else
        {
            // matchedFuzzy is only non-null when ignoreWords is non-null, so this is safe.
            Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
            ignoreWords.Add(w);
        }
    }

    // Report up to ten of the phrases that follow the selected ones.
    string nextTen = string.Empty;
    int n = 0;
    while (n < 10 && (currentNotYetReported || termIter.MoveNext()))
    {
        currentNotYetReported = false;
        n++;
        CandidatePhrase w = termIter.Current;
        nextTen += ";\t" + w + ":" + newdt.GetCount(w);
    }
    Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
    return finalwords;
}
/// <summary>
/// Builds the feature datum for token <paramref name="i"/> of <paramref name="sent"/>.
/// </summary>
/// <remarks>
/// The label is <c>answerLabel</c> when the token's <c>answerClass</c> annotation equals it,
/// otherwise "O". One "Cluster-N" feature is set per matched phrase, with N = -1 when the
/// phrase has no entry in <c>clusterIds</c>. The context window is currently 0, so the
/// PREV-/NEXT- loops add nothing.
/// </remarks>
/// <param name="sent">The sentence tokens.</param>
/// <param name="i">Index of the token to featurise.</param>
/// <returns>A real-valued datum holding the features and the label.</returns>
private RVFDatum<string, string> GetDatum(CoreLabel[] sent, int i)
{
    ICounter<string> feat = new ClassicCounter<string>();
    CoreLabel l = sent[i];

    string label;
    if (l.Get(answerClass).ToString().Equals(answerLabel))
    {
        label = answerLabel;
    }
    else
    {
        label = "O";
    }

    CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
    if (matchedPhrases == null)
    {
        // No recorded matches: fall back to the token's own word.
        matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
        matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
    }

    foreach (CandidatePhrase w in matchedPhrases.AllValues())
    {
        // The original read "int num = clusterIds[...]; if (num == null)": the null test can never
        // be true for an int, and the indexer throws for an unknown phrase. TryGetValue restores
        // the intended "-1 when unknown" fallback.
        int num;
        if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
        {
            num = -1;
        }
        feat.SetCount("Cluster-" + num, 1.0);
    }

    // feat.incrementCount("WORD-" + l.word());
    // feat.incrementCount("LEMMA-" + l.lemma());
    // feat.incrementCount("TAG-" + l.tag());
    int window = 0;
    for (int j = Math.Max(0, i - window); j < i; j++)
    {
        CoreLabel lj = sent[j];
        feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
        feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
    }
    for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
    {
        CoreLabel lj = sent[j_1];
        feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
        feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
    }

    return new RVFDatum<string, string>(feat, label);
}
/// <summary>Returns the non-zero weights as a counter keyed by feature.</summary>
/// <returns>A counter with one entry per feature in <c>featureIndex</c> whose weight is not 0.0.</returns>
public virtual ICounter<F> WeightsAsCounter()
{
    ICounter<F> nonZeroWeights = new ClassicCounter<F>();
    foreach (F feature in featureIndex)
    {
        double weight = weights[featureIndex.IndexOf(feature)];
        if (weight == 0.0)
        {
            continue; // zero weights are left out of the counter
        }
        nonZeroWeights.SetCount(feature, weight);
    }
    return nonZeroWeights;
}
/// <summary>
/// Computes, for every key present in either <c>lengthLabelsCorrect</c> or
/// <c>lengthLabelsIncorrect</c>, the ratio correct / (correct + incorrect).
/// </summary>
/// <returns>A counter from key to that ratio.</returns>
public virtual ICounter<int> LengthAccuracies()
{
    // Union of the keys seen in either counter.
    ICollection<int> lengths = Generics.NewHashSet();
    Sharpen.Collections.AddAll(lengths, lengthLabelsCorrect.KeySet());
    Sharpen.Collections.AddAll(lengths, lengthLabelsIncorrect.KeySet());

    ICounter<int> accuracyByLength = new ClassicCounter<int>();
    foreach (int length in lengths)
    {
        double correct = lengthLabelsCorrect.GetCount(length);
        double attempted = correct + lengthLabelsIncorrect.GetCount(length);
        accuracyByLength.SetCount(length, correct / attempted);
    }
    return accuracyByLength;
}
/// <summary>
/// Builds two surface patterns from separately constructed but equivalent inputs and checks
/// that they compare as equal, collapse to one key in a counter, and collapse to one entry
/// in a hash index.
/// </summary>
public virtual void TestSimplerTokens()
{
    // First pattern.
    IDictionary<Type, string> prevA = new _Dictionary_44();
    IDictionary<Type, string> nextA = new _Dictionary_49();
    PatternToken tokenA = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern patternA = new SurfacePattern(CreateContext(prevA), tokenA, CreateContext(nextA), SurfacePatternFactory.Genre.Prevnext);

    // Second pattern, built from fresh objects.
    IDictionary<Type, string> prevB = new _Dictionary_58();
    IDictionary<Type, string> nextB = new _Dictionary_63();
    PatternToken tokenB = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern patternB = new SurfacePattern(CreateContext(prevB), tokenB, CreateContext(nextB), SurfacePatternFactory.Genre.Prevnext);

    System.Diagnostics.Debug.Assert(patternA.CompareTo(patternB) == 0);

    // Both patterns must land on the same counter key.
    ICounter<SurfacePattern> patternCounts = new ClassicCounter<SurfacePattern>();
    patternCounts.SetCount(patternA, 1);
    patternCounts.SetCount(patternB, 1);
    System.Diagnostics.Debug.Assert(patternCounts.Size() == 1);
    System.Console.Out.WriteLine("pats size is " + patternCounts.Size());

    // ...and on the same index slot.
    ConcurrentHashIndex<SurfacePattern> patternIndex = new ConcurrentHashIndex<SurfacePattern>();
    patternIndex.Add(patternA);
    patternIndex.Add(patternB);
    System.Diagnostics.Debug.Assert(patternIndex.Count == 1);
}
/// <summary>
/// Scores <paramref name="example"/> against every label in <c>labelIndex</c> using that
/// label's binary classifier; a label's score is the binary classifier's score for <c>PosLabel</c>.
/// </summary>
/// <param name="example">The datum to score.</param>
/// <returns>A counter from label to score.</returns>
public virtual ICounter<L> ScoresOf(IDatum<L, F> example)
{
    ICounter<L> labelScores = new ClassicCounter<L>();
    foreach (L candidate in labelIndex)
    {
        // Re-label the datum for the binary problem: the candidate maps to PosLabel,
        // and NegLabel is passed as the third argument to MapDatum.
        IDictionary<L, string> relabeling = new ArrayMap<L, string>();
        relabeling[candidate] = PosLabel;
        IDatum<string, F> binaryDatum = GeneralDataset.MapDatum(example, relabeling, NegLabel);

        IClassifier<string, F> candidateClassifier = GetBinaryClassifier(candidate);
        ICounter<string> binaryScores = candidateClassifier.ScoresOf(binaryDatum);
        labelScores.SetCount(candidate, binaryScores.GetCount(PosLabel));
    }
    return labelScores;
}
/// <summary>
/// Returns, for every row <c>i</c> of <c>weights</c>, a counter of the non-zero entries keyed
/// by feature, stored under the label <c>labelIndex.Get(i)</c>.
/// </summary>
/// <returns>A dictionary from label to its counter of non-zero weights.</returns>
public virtual IDictionary<L, ICounter<F>> WeightsAsGenericCounter()
{
    IDictionary<L, ICounter<F>> weightsByLabel = new Dictionary<L, ICounter<F>>();
    for (int row = 0; row < weights.Length; row++)
    {
        ICounter<F> nonZero = new ClassicCounter<F>();
        L label = labelIndex.Get(row);
        double[] rowWeights = weights[row];

        foreach (F feature in featureIndex)
        {
            int column = featureIndex.IndexOf(feature);
            if (rowWeights[column] == 0.0)
            {
                continue; // zero weights are left out
            }
            nonZero.SetCount(feature, rowWeights[column]);
        }

        weightsByLabel[label] = nonZero;
    }
    return weightsByLabel;
}
// todo: Fix javadoc, have unit tested
/// <summary>Print SVM Light Format file.</summary>
/// <remarks>
/// Writes one line per data item: the label string taken from <c>MakeSvmLabelMap()</c>,
/// followed by <c>index:value</c> pairs sorted by feature index, where the printed index is
/// the feature's index plus one.
/// Historical note carried over from the original comments: the label map is needed because
/// svm_light has special conventions not supported by default by our labels, e.g. in a
/// multiclass setting it assumes that labels start at 1 whereas our labels start at 0.
/// </remarks>
/// <param name="pw">The writer that receives one line per data item.</param>
public virtual void PrintSVMLightFormat(PrintWriter pw)
{
    string[] labelMap = MakeSvmLabelMap();
    for (int row = 0; row < size; row++)
    {
        RVFDatum<L, F> datum = GetRVFDatum(row);
        ICounter<F> featureValues = datum.AsFeaturesCounter();

        // Re-key the values by feature index so they can be sorted numerically.
        ClassicCounter<int> valuesByIndex = new ClassicCounter<int>();
        foreach (F feature in featureValues.KeySet())
        {
            valuesByIndex.SetCount(featureIndex.IndexOf(feature), featureValues.GetCount(feature));
        }

        int[] sortedIndices = Sharpen.Collections.ToArray(valuesByIndex.KeySet(), new int[valuesByIndex.KeySet().Count]);
        Arrays.Sort(sortedIndices);

        StringBuilder line = new StringBuilder();
        line.Append(labelMap[labels[row]]);
        line.Append(' ');

        // Values come from the index-keyed counter, not the original feature-keyed one.
        foreach (int index in sortedIndices)
        {
            line.Append((index + 1));
            line.Append(':');
            line.Append(valuesByIndex.GetCount(index));
            line.Append(' ');
        }
        pw.Println(line.ToString());
    }
}
/// <summary>
/// Given an instance to classify, scores and returns
/// score by class.
/// </summary>
/// <remarks>
/// Every stored instance is scored by cosine similarity to the datum; the top <c>k</c>
/// instances then vote for their class, each vote counting 1.0 or, when
/// <c>weightedVotes</c> is set, the instance's similarity. If <c>l2Normalize</c> is set
/// the datum's features are normalised first.
/// NOTE: supports only RVFDatums
/// </remarks>
/// <param name="datum">The datum to classify; must be a <c>RVFDatum&lt;K, V&gt;</c>.</param>
/// <returns>Vote totals per class, or null when <paramref name="datum"/> is not an RVFDatum.</returns>
public virtual ClassicCounter<K> ScoresOf(IDatum<K, V> datum)
{
    // The original test "datum is RVFDatum<object, object>" is a Java-wildcard conversion artifact:
    // generic classes are invariant in C#, so it never matched and this method always returned null.
    RVFDatum<K, V> vec = datum as RVFDatum<K, V>;
    if (vec == null)
    {
        return null;
    }

    if (l2Normalize)
    {
        ClassicCounter<V> featVec = new ClassicCounter<V>(vec.AsFeaturesCounter());
        Counters.Normalize(featVec);
        vec = new RVFDatum<K, V>(featVec);
    }

    // Similarity of the query to every stored instance.
    ClassicCounter<ICounter<V>> scores = new ClassicCounter<ICounter<V>>();
    foreach (ICounter<V> instance in instances.AllValues())
    {
        scores.SetCount(instance, Counters.Cosine(vec.AsFeaturesCounter(), instance));
    }

    // Let the first k instances of the sorted list vote for their class.
    IList<ICounter<V>> sorted = Counters.ToSortedList(scores);
    ClassicCounter<K> classScores = new ClassicCounter<K>();
    for (int i = 0; i < k && i < sorted.Count; i++)
    {
        K label = classLookup[sorted[i]];
        double count = 1.0;
        if (weightedVotes)
        {
            count = scores.GetCount(sorted[i]);
        }
        classScores.IncrementCount(label, count);
    }
    return classScores;
}
/// <summary>
/// Prints annotation statistics to the console and emits Java source for the selective-split
/// tables.
/// </summary>
/// <remarks>
/// For every node, each parent-annotated rule set whose first element matches the node is
/// scored as KL(annotated || plain) * support(annotated). The same is done for
/// grandparent-annotated rule sets against parent-annotated ones, skipping parents whose
/// support is below <c>Suppcutoff</c>. Entries scoring at least <c>Cutoffs[j]</c> are appended
/// to the j-th generated <c>splitters</c> array. Finally all scores are printed in
/// priority-queue order, followed by the generated Java code.
/// </remarks>
public virtual void PrintStats()
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(2);

    // Store java code for selSplit
    StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        javaSB[i] = new StringBuilder(" private static String[] splitters" + (i + 1) + " = new String[] {");
    }

    ClassicCounter<IList<string>> allScores = new ClassicCounter<IList<string>>();

    // do value of parent
    foreach (string node in nodeRules.Keys)
    {
        List<Pair<IList<string>, double>> answers = Generics.NewArrayList();
        ClassicCounter<IList<string>> cntr = nodeRules[node];
        double support = (cntr.TotalCount());
        System.Console.Out.WriteLine("Node " + node + " support is " + support);
        foreach (IList<string> key in pRules.Keys)
        {
            if (key[0].Equals(node))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = pRules[key];
                double support2 = (cntr2.TotalCount());
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(new Pair<IList<string>, double>(key, score));
                allScores.SetCount(key, score);
            }
        }
        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        SortByScoreDescending(answers);
        foreach (Pair<IList<string>, double> answer in answers)
        {
            double psd = answer.Second();
            System.Console.Out.WriteLine(answer.First() + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                IList<string> lst = answer.First();
                string nd = lst[0];
                string par = lst[1];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    // do value of grandparent
    foreach (IList<string> node_1 in pRules.Keys)
    {
        List<Pair<IList<string>, double>> answers = Generics.NewArrayList();
        ClassicCounter<IList<string>> cntr = pRules[node_1];
        double support = (cntr.TotalCount());
        if (support < Suppcutoff)
        {
            continue;
        }
        System.Console.Out.WriteLine("Node " + node_1 + " support is " + support);
        foreach (IList<string> key in gPRules.Keys)
        {
            if (key[0].Equals(node_1[0]) && key[1].Equals(node_1[1]))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = gPRules[key];
                double support2 = (cntr2.TotalCount());
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node_1 + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(Pair.MakePair(key, score));
                allScores.SetCount(key, score);
            }
        }
        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        SortByScoreDescending(answers);
        foreach (Pair<IList<string>, double> answer in answers)
        {
            double psd = answer.Second();
            System.Console.Out.WriteLine(answer.First() + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                IList<string> lst = answer.First();
                string nd = lst[0];
                string par = lst[1];
                string gpar = lst[2];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("~");
                        javaSB[j].Append(gpar).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("All scores:");
    IPriorityQueue<IList<string>> pq = Counters.ToPriorityQueue(allScores);
    while (!pq.IsEmpty())
    {
        IList<string> key = pq.GetFirst();
        double score = pq.GetPriority(key);
        pq.RemoveFirst();
        System.Console.Out.WriteLine(key + "\t" + score);
    }

    System.Console.Out.WriteLine(" // Automatically generated by ParentAnnotationStats -- preferably don't edit");
    for (int i_1 = 0; i_1 < Cutoffs.Length; i_1++)
    {
        // Swap the last two characters (the trailing ", " of the final entry) for the array terminator.
        int len = javaSB[i_1].Length;
        javaSB[i_1].Replace(len - 2, len, "};");
        System.Console.Out.WriteLine(javaSB[i_1]);
    }

    System.Console.Out.Write(" public static HashSet splitters = new HashSet(Arrays.asList(");
    for (int i_2 = Cutoffs.Length; i_2 > 0; i_2--)
    {
        if (i_2 == 1)
        {
            System.Console.Out.Write("splitters1");
        }
        else
        {
            System.Console.Out.Write("selectiveSplit" + i_2 + " ? splitters" + i_2 + " : (");
        }
    }
    // need to print extra one to close other things open
    for (int i_3 = Cutoffs.Length; i_3 >= 0; i_3--)
    {
        System.Console.Out.Write(")");
    }
    System.Console.Out.WriteLine(";");
}

/// <summary>
/// Sorts (key, score) pairs by score, highest first. The original call
/// <c>answers.Sort(null)</c> had lost its comparator, so the order printed under the
/// "Sorted descending" heading was not actually by descending score.
/// </summary>
private static void SortByScoreDescending(List<Pair<IList<string>, double>> answers)
{
    answers.Sort((left, right) => right.Second().CompareTo(left.Second()));
}
/// <summary>
/// Runs the Viterbi algorithm on the sequence model, and then proceeds to efficiently
/// backwards decode the best k label sequence assignments.
/// </summary>
/// <remarks>
/// This sequence finder only works on SequenceModel's with rightWindow == 0.
/// </remarks>
/// <param name="ts">The SequenceModel to find the best k label sequence assignments of</param>
/// <param name="k">The number of top-scoring assignments to find.</param>
/// <returns>
/// A Counter that maps from a sequence assignment (int array) to a double score; it holds one
/// entry for each of the k slots whose final score is above negative infinity.
/// </returns>
/// <exception cref="ArgumentException">If the model's right window is not 0.</exception>
public virtual ICounter<int[]> KBestSequences(ISequenceModel ts, int k)
{
    // Set up tag options
    int length = ts.Length();
    int leftWindow = ts.LeftWindow();
    int rightWindow = ts.RightWindow();
    if (rightWindow != 0)
    {
        throw new ArgumentException("KBestSequenceFinder only works with rightWindow == 0 not " + rightWindow);
    }
    int padLength = length + leftWindow + rightWindow;
    int[][] tags = new int[padLength][];
    int[] tagNum = new int[padLength];
    for (int pos = 0; pos < padLength; pos++)
    {
        tags[pos] = ts.GetPossibleValues(pos);
        tagNum[pos] = tags[pos].Length;
    }
    int[] tempTags = new int[padLength];

    // Set up product space sizes
    int[] productSizes = new int[padLength];
    int curProduct = 1;
    for (int i = 0; i < leftWindow; i++)
    {
        curProduct *= tagNum[i];
    }
    for (int pos_1 = leftWindow; pos_1 < padLength; pos_1++)
    {
        if (pos_1 > leftWindow + rightWindow)
        {
            curProduct /= tagNum[pos_1 - leftWindow - rightWindow - 1]; // shift off
        }
        curProduct *= tagNum[pos_1]; // shift on
        productSizes[pos_1 - rightWindow] = curProduct;
    }

    double[][] windowScore = new double[padLength][];

    // Score all of each window's options
    for (int pos_2 = leftWindow; pos_2 < leftWindow + length; pos_2++)
    {
        windowScore[pos_2] = new double[productSizes[pos_2]];
        Arrays.Fill(tempTags, tags[0][0]);
        for (int product = 0; product < productSizes[pos_2]; product++)
        {
            int p = product;
            int shift = 1;
            for (int curPos = pos_2; curPos >= pos_2 - leftWindow; curPos--)
            {
                tempTags[curPos] = tags[curPos][p % tagNum[curPos]];
                p /= tagNum[curPos];
                if (curPos > pos_2)
                {
                    shift *= tagNum[curPos];
                }
            }
            if (tempTags[pos_2] == tags[pos_2][0])
            {
                // get all tags at once
                double[] scores = ts.ScoresOf(tempTags, pos_2);
                // fill in the relevant windowScores
                for (int t = 0; t < tagNum[pos_2]; t++)
                {
                    windowScore[pos_2][product + t * shift] = scores[t];
                }
            }
        }
    }

    // Set up score and backtrace arrays
    double[][][] score = new double[padLength][][];
    int[][][][] trace = new int[padLength][][][];
    int[][] numWaysToMake = new int[padLength][];
    for (int pos_3 = 0; pos_3 < padLength; pos_3++)
    {
        score[pos_3] = new double[productSizes[pos_3]][];
        // the 2 is for backtrace, and which of the k best for that backtrace
        trace[pos_3] = new int[productSizes[pos_3]][][];
        numWaysToMake[pos_3] = new int[productSizes[pos_3]];
        Arrays.Fill(numWaysToMake[pos_3], 1);
        for (int product = 0; product < productSizes[pos_3]; product++)
        {
            if (pos_3 > leftWindow)
            {
                // loop over possible predecessor types
                int sharedProduct = product / tagNum[pos_3];
                int factor = productSizes[pos_3] / tagNum[pos_3];
                numWaysToMake[pos_3][product] = 0;
                for (int newTagNum = 0; newTagNum < tagNum[pos_3 - leftWindow - 1] && numWaysToMake[pos_3][product] < k; newTagNum++)
                {
                    int predProduct = newTagNum * factor + sharedProduct;
                    numWaysToMake[pos_3][product] += numWaysToMake[pos_3 - 1][predProduct];
                }
                if (numWaysToMake[pos_3][product] > k)
                {
                    numWaysToMake[pos_3][product] = k;
                }
            }
            score[pos_3][product] = new double[numWaysToMake[pos_3][product]];
            Arrays.Fill(score[pos_3][product], double.NegativeInfinity);
            trace[pos_3][product] = new int[numWaysToMake[pos_3][product]][];
            Arrays.Fill(trace[pos_3][product], new int[] { -1, -1 });
        }
    }

    // Do forward Viterbi algorithm
    // this is the hottest loop, so cache loop control variables hoping for a little speed....
    // posMax, productMax and maxTagNum were referenced but never declared in the converted code
    // (the extra for-initialiser declarations were dropped). They are restored here with the same
    // bounds the set-up loops above use.
    int posMax = length + leftWindow;
    // loop over the classification spot
    for (int pos_4 = leftWindow; pos_4 < posMax; pos_4++)
    {
        int productMax = productSizes[pos_4];
        // loop over window product types
        for (int product = 0; product < productMax; product++)
        {
            // check for initial spot
            double[] scorePos = score[pos_4][product];
            int[][] tracePos = trace[pos_4][product];
            if (pos_4 == leftWindow)
            {
                // no predecessor type
                scorePos[0] = windowScore[pos_4][product];
            }
            else
            {
                // loop over possible predecessor types/k-best
                int sharedProduct = product / tagNum[pos_4 + rightWindow];
                int factor = productSizes[pos_4] / tagNum[pos_4 + rightWindow];
                int maxTagNum = tagNum[pos_4 - leftWindow - 1];
                for (int newTagNum = 0; newTagNum < maxTagNum; newTagNum++)
                {
                    int predProduct = newTagNum * factor + sharedProduct;
                    double[] scorePosPrev = score[pos_4 - 1][predProduct];
                    for (int k1 = 0; k1 < scorePosPrev.Length; k1++)
                    {
                        double predScore = scorePosPrev[k1] + windowScore[pos_4][product];
                        if (predScore > scorePos[0])
                        {
                            // new value higher then lowest value we should keep
                            int k2 = Arrays.BinarySearch(scorePos, predScore);
                            k2 = k2 < 0 ? -k2 - 2 : k2 - 1;
                            // open a spot at k2 by shifting off the lowest value
                            System.Array.Copy(scorePos, 1, scorePos, 0, k2);
                            System.Array.Copy(tracePos, 1, tracePos, 0, k2);
                            scorePos[k2] = predScore;
                            tracePos[k2] = new int[] { predProduct, k1 };
                        }
                    }
                }
            }
        }
    }

    // Project the actual tag sequence
    int[] whichDerivation = new int[k];
    int[] bestCurrentProducts = new int[k];
    double[] bestFinalScores = new double[k];
    Arrays.Fill(bestFinalScores, double.NegativeInfinity);

    // just the last guy
    for (int product_1 = 0; product_1 < productSizes[padLength - 1]; product_1++)
    {
        double[] scorePos = score[padLength - 1][product_1];
        for (int k1 = scorePos.Length - 1; k1 >= 0 && scorePos[k1] > bestFinalScores[0]; k1--)
        {
            int k2 = Arrays.BinarySearch(bestFinalScores, scorePos[k1]);
            k2 = k2 < 0 ? -k2 - 2 : k2 - 1;
            // open a spot at k2 by shifting off the lowest value
            System.Array.Copy(bestFinalScores, 1, bestFinalScores, 0, k2);
            System.Array.Copy(whichDerivation, 1, whichDerivation, 0, k2);
            System.Array.Copy(bestCurrentProducts, 1, bestCurrentProducts, 0, k2);
            bestCurrentProducts[k2] = product_1;
            whichDerivation[k2] = k1;
            bestFinalScores[k2] = scorePos[k1];
        }
    }

    // Follow the backtrace for each kept final state, best score first.
    ClassicCounter<int[]> kBestWithScores = new ClassicCounter<int[]>();
    for (int k1_1 = k - 1; k1_1 >= 0 && bestFinalScores[k1_1] > double.NegativeInfinity; k1_1--)
    {
        int lastProduct = bestCurrentProducts[k1_1];
        for (int last = padLength - 1; last >= length - 1 && last >= 0; last--)
        {
            tempTags[last] = tags[last][lastProduct % tagNum[last]];
            lastProduct /= tagNum[last];
        }
        for (int pos_5 = leftWindow + length - 2; pos_5 >= leftWindow; pos_5--)
        {
            int bestNextProduct = bestCurrentProducts[k1_1];
            bestCurrentProducts[k1_1] = trace[pos_5 + 1][bestNextProduct][whichDerivation[k1_1]][0];
            whichDerivation[k1_1] = trace[pos_5 + 1][bestNextProduct][whichDerivation[k1_1]][1];
            tempTags[pos_5 - leftWindow] = tags[pos_5 - leftWindow][bestCurrentProducts[k1_1] / (productSizes[pos_5] / tagNum[pos_5 - leftWindow])];
        }
        // Copy the tags: tempTags is reused for the next sequence.
        kBestWithScores.SetCount(Arrays.CopyOf(tempTags, tempTags.Length), bestFinalScores[k1_1]);
    }
    return kBestWithScores;
}
/// <summary>
/// Builds one feature vector per (quote, candidate mention) pair of a document and
/// collects them in an RVF dataset.
/// </summary>
/// <remarks>
/// Candidate mentions are gathered from the span before the quote (reaching back into the
/// previous paragraph) and from the span after the quote up to the end of the quote's
/// paragraph. Mentions that overlap the quote are skipped. Assigns the static
/// <c>sieve</c> field as a side effect.
/// </remarks>
/// <param name="sd">Document plus the character, pronoun-coreference and animacy data handed to the sieve.</param>
/// <param name="goldList">Gold quote info, one per quote; null if not training.</param>
/// <param name="isTraining">
/// When true, data are labelled "isMention"/"isNotMention" from the gold mention range and
/// quotes whose gold speaker is empty are skipped; otherwise every datum is labelled "none".
/// </param>
/// <returns>
/// The dataset together with a map from quote index to its (first, last) datum index and a
/// map from datum index to the mention it was built from.
/// </returns>
/// <exception cref="System.Exception">
/// If training and the gold list and quote list differ in size, or if a quote cannot be
/// found among the quotes of its own paragraph.
/// </exception>
public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList<XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
{
    Annotation doc = sd.doc;
    sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IDictionary<int, IList<ICoreMap>> paragraphToQuotes = GetQuotesInParagraph(doc);
    GeneralDataset<string, string> dataset = new RVFDataset<string, string>();
    // necessary for 'ScoreBestMention': maps quote to corresponding indices in the dataset
    IDictionary<int, Pair<int, int>> mapQuoteToDataRange = new Dictionary<int, Pair<int, int>>();
    // maps a datum's index in the dataset to the mention it describes
    IDictionary<int, Sieve.MentionData> mapDatumToMention = new Dictionary<int, Sieve.MentionData>();
    if (isTraining && goldList.Count != quotes.Count)
    {
        throw new Exception("Gold Quote List size doesn't match quote list size!");
    }
    for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
    {
        int initialSize = dataset.Size();
        ICoreMap quote = quotes[quoteIdx];
        XMLToAnnotation.GoldQuoteInfo gold = null;
        if (isTraining)
        {
            gold = goldList[quoteIdx];
            if (gold.speaker == string.Empty)
            {
                continue;
            }
        }
        ICoreMap quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
        Pair<int, int> quoteRun = new Pair<int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
        int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));

        // add mentions before quote up to the previous paragraph
        int rightValue = quoteRun.first - 1;
        int leftValue = quoteRun.first - 1;
        // move left value to be the first token idx of the previous paragraph
        for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
        {
            ICoreMap sentence = sentences[sentIdx];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
            {
                continue;
            }
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
            {
                // quoteParagraphIdx - 1 for this and prev
                leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
            }
            else
            {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInPreviousParagraph = new List<Sieve.MentionData>();
        if (leftValue > -1 && rightValue > -1)
        {
            mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair<int, int>(leftValue, rightValue)));
        }

        // mentions after the quote; the scan below stops at the end of the quote's own
        // paragraph (the original carried a "quoteParagraphIdx + 1" remark here)
        leftValue = quoteRun.second + 1;
        rightValue = quoteRun.second + 1;
        for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
        {
            ICoreMap sentence = sentences[sentIdx_1];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
            {
                rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
            }
            else
            {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInNextParagraph = new List<Sieve.MentionData>();
        if (leftValue < tokens.Count && rightValue < tokens.Count)
        {
            mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair<int, int>(leftValue, rightValue));
        }
        IList<Sieve.MentionData> candidateMentions = new List<Sieve.MentionData>();
        Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
        Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);

        int rankedDistance = 1;
        int numBackwards = mentionsInPreviousParagraph.Count;
        foreach (Sieve.MentionData mention in candidateMentions)
        {
            IList<CoreLabel> mentionCandidateTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
            ICoreMap mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
            ICounter<string> features = new ClassicCounter<string>();

            // 1. distance features
            bool isLeft = true;
            int distance = quoteRun.first - mention.end;
            if (distance < 0)
            {
                isLeft = false;
                distance = mention.begin - quoteRun.second;
            }
            if (distance < 0)
            {
                // disregard mention-in-quote cases.
                continue;
            }
            features.SetCount("wordDistance", distance);
            IList<CoreLabel> betweenTokens;
            if (isLeft)
            {
                betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
            }
            else
            {
                betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
            }
            // Punctuation in between
            foreach (CoreLabel token in betweenTokens)
            {
                if (punctuation.Contains(token.Word()))
                {
                    features.SetCount("punctuationPresence:" + token.Word(), 1);
                }
            }
            // number of mentions away
            features.SetCount("rankedDistance", rankedDistance);
            rankedDistance++;
            if (rankedDistance == numBackwards)
            {
                // reset for the forward
                rankedDistance = 1;
            }

            // third distance: # of paragraphs away
            int mentionParagraphIdx = -1;
            ICoreMap sentenceInMentionParagraph = null;
            int quoteParagraphBeginToken = GetParagraphBeginToken(quoteFirstSentence, sentences);
            int quoteParagraphEndToken = GetParagraphEndToken(quoteFirstSentence, sentences);
            if (isLeft)
            {
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                {
                    features.SetCount("leftParagraphDistance", 0);
                    mentionParagraphIdx = quoteParagraphIdx;
                    sentenceInMentionParagraph = quoteFirstSentence;
                }
                else
                {
                    int paragraphDistance = 1;
                    int currParagraphIdx = quoteParagraphIdx - paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currParagraphIdx >= 0)
                    {
                        // walk back to a sentence of the paragraph being examined
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                        {
                            currSentenceIdx--;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                        {
                            mentionParagraphIdx = currParagraphIdx;
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("leftParagraphDistance", paragraphDistance);
                            if (paragraphDistance % 2 == 0)
                            {
                                features.SetCount("leftParagraphDistanceEven", 1);
                            }
                            break;
                        }
                        paragraphDistance++;
                        currParagraphIdx--;
                    }
                }
            }
            else
            {
                // right
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                {
                    features.SetCount("rightParagraphDistance", 0);
                    sentenceInMentionParagraph = quoteFirstSentence;
                    mentionParagraphIdx = quoteParagraphIdx;
                }
                else
                {
                    int paragraphDistance = 1;
                    int nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currSentenceIdx < sentences.Count)
                    {
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                        {
                            currSentenceIdx++;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int nextParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                        {
                            // NOTE(review): mentionParagraphIdx is left at -1 on this path,
                            // unlike the left-hand case — confirm this is intended.
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("rightParagraphDistance", paragraphDistance);
                            break;
                        }
                        paragraphDistance++;
                        nextParagraphIndex++;
                    }
                }
            }

            // 2. mention features
            if (sentenceInMentionParagraph != null)
            {
                int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                int mentionParagraphEnd = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                {
                    IList<ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List<ICoreMap>());
                    Pair<List<string>, List<Pair<int, int>>> namesInMentionParagraph = sieve.ScanForNames(new Pair<int, int>(mentionParagraphBegin, mentionParagraphEnd));
                    features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                    features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                    features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                    // mention ordering in paragraph it is in
                    for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                    {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                        {
                            features.SetCount("orderInParagraph", i);
                        }
                    }
                    // if mention paragraph is all one quote
                    if (quotesInMentionParagraph.Count == 1)
                    {
                        ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                        if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                        {
                            features.SetCount("mentionParagraphIsInConversation", 1);
                        }
                        else
                        {
                            features.SetCount("mentionParagraphIsInConversation", -1);
                        }
                    }
                    foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                    {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair<int, int>(mention.begin, mention.end)))
                        {
                            features.SetCount("mentionInQuote", 1);
                        }
                    }
                    if (features.GetCount("mentionInQuote") != 1)
                    {
                        features.SetCount("mentionNotInQuote", 1);
                    }
                }
            }

            // nearby word syntax types...make sure to check if there are previous or next words
            // or there will be an array index crash
            if (mention.begin > 0)
            {
                CoreLabel prevWord = tokens[mention.begin - 1];
                features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                if (punctuationForFeatures.Contains(prevWord.Lemma()))
                {
                    features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                }
            }
            if (mention.end + 1 < tokens.Count)
            {
                CoreLabel nextWord = tokens[mention.end + 1];
                features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                if (punctuationForFeatures.Contains(nextWord.Lemma()))
                {
                    features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                }
            }

            // quote paragraph features
            IList<ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
            features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
            features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
            features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair<int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);

            // quote features
            features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
            for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
            {
                if (quotesInQuoteParagraph[i_1].Equals(quote))
                {
                    features.SetCount("quotePosition", i_1 + 1);
                }
            }
            if (features.GetCount("quotePosition") == 0)
            {
                throw new Exception("Check this (equality not working)");
            }
            Pair<List<string>, List<Pair<int, int>>> namesData = sieve.ScanForNames(quoteRun);
            foreach (string name in namesData.first)
            {
                features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
            }
            // if quote encompasses entire paragraph
            if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
            {
                features.SetCount("isImplicitSpeaker", 1);
            }
            else
            {
                features.SetCount("isImplicitSpeaker", -1);
            }

            // Vocative detection
            if (mention.type.Equals("name"))
            {
                IList<Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair<int, int>(mention.begin, mention.end))];
                Person p = null;
                if (pList != null)
                {
                    p = pList[0];
                }
                else
                {
                    Pair<List<string>, List<Pair<int, int>>> scanForNamesResultPair = sieve.ScanForNames(new Pair<int, int>(mention.begin, mention.end));
                    if (scanForNamesResultPair.first.Count != 0)
                    {
                        string scanForNamesResultString = scanForNamesResultPair.first[0];
                        if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                        {
                            p = sd.characterMap[scanForNamesResultString][0];
                        }
                    }
                }
                if (p != null)
                {
                    foreach (string name_1 in namesData.first)
                    {
                        if (p.aliases.Contains(name_1))
                        {
                            features.SetCount("nameInQuote", 1);
                        }
                    }
                    if (quoteParagraphIdx > 0)
                    {
                        IList<ICoreMap> quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List<ICoreMap>());
                        IList<Pair<int, int>> exclusionList = new List<Pair<int, int>>();
                        foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                        {
                            Pair<int, int> quoteRange = new Pair<int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                            exclusionList.Add(quoteRange);
                            foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                            {
                                if (p.aliases.Contains(name_2))
                                {
                                    features.SetCount("nameInPrevParagraphQuote", 1);
                                }
                            }
                        }
                        // Find the closest preceding sentence that belongs to the previous
                        // paragraph. The loop variable is used consistently here; the
                        // translated code tested an out-of-scope index.
                        int sentenceIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                        ICoreMap sentenceInPrevParagraph = null;
                        for (int i = sentenceIdx - 1; i >= 0; i--)
                        {
                            ICoreMap currSentence = sentences[i];
                            if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                            {
                                sentenceInPrevParagraph = currSentence;
                                break;
                            }
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                        IList<Pair<int, int>> prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair<int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                        foreach (Pair<int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                        {
                            foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                            {
                                if (p.aliases.Contains(name_2))
                                {
                                    features.SetCount("nameInPrevParagraphNonQuote", 1);
                                }
                            }
                        }
                    }
                }
            }

            // 3. label the datum and register it
            string datumLabel;
            if (isTraining)
            {
                bool isGoldMention = QuoteAttributionUtils.RangeContains(new Pair<int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair<int, int>(mention.begin, mention.end));
                datumLabel = isGoldMention ? "isMention" : "isNotMention";
            }
            else
            {
                datumLabel = "none";
            }
            RVFDatum<string, string> datum = new RVFDatum<string, string>(features, datumLabel);
            // The datum's ID and its key in mapDatumToMention are both the index it will
            // occupy in the dataset, so capture that index before adding.
            int datumIdx = dataset.Size();
            datum.SetID(datumIdx.ToString(System.Globalization.CultureInfo.InvariantCulture));
            mapDatumToMention[datumIdx] = mention;
            dataset.Add(datum);
        }
        mapQuoteToDataRange[quoteIdx] = new Pair<int, int>(initialSize, dataset.Size() - 1);
    }
    return new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset);
}
/// <summary>Takes time linear in number of arcs.</summary>
/// <remarks>
/// Performs a breadth-first search backwards from the graph's end nodes (following arcs
/// from target to source). Each discovered node gets a length one greater than the node it
/// was reached from, and its lambda is that arc's output plus the lambda of the arc's
/// target. When a node is reached again at the same length, the arc whose input compares
/// smallest wins. End nodes have lambda 0; nodes never reached keep lambda 0.
/// </remarks>
/// <param name="graph">The transducer graph; arc outputs are unboxed as doubles and arc inputs are cast to IComparable.</param>
/// <returns>A counter mapping each node to its lambda value.</returns>
public static ClassicCounter ComputeLambda(TransducerGraph graph)
{
    System.Collections.Generic.Queue<object> queue = new System.Collections.Generic.Queue<object>();
    ClassicCounter lambda = new ClassicCounter();
    ClassicCounter length = new ClassicCounter();
    // For each discovered node, the input of the arc currently chosen for it.
    IDictionary first = new Hashtable();
    ISet nodes = graph.GetNodes();
    foreach (object node in nodes)
    {
        lambda.SetCount(node, 0);
        length.SetCount(node, double.PositiveInfinity);
    }
    ISet endNodes = graph.GetEndNodes();
    foreach (object o in endNodes)
    {
        lambda.SetCount(o, 0);
        length.SetCount(o, 0);
        queue.Enqueue(o);
    }
    // Breadth first search
    while (queue.Count > 0)
    {
        // get the first node from the queue
        object current = queue.Dequeue();
        if (current == null)
        {
            // The search has always stopped on a null node; keep that behavior.
            break;
        }
        double oldLen = length.GetCount(current);
        ISet arcs = graph.GetArcsByTarget(current);
        if (arcs == null)
        {
            continue;
        }
        foreach (object arc1 in arcs)
        {
            TransducerGraph.Arc arc = (TransducerGraph.Arc)arc1;
            object newNode = arc.GetSourceNode();
            IComparable a = (IComparable)arc.GetInput();
            double k = ((double)arc.GetOutput());
            double newLen = length.GetCount(newNode);
            if (newLen == double.PositiveInfinity)
            {
                // we are discovering this
                queue.Enqueue(newNode);
            }
            IComparable f = (IComparable)first[newNode];
            // f can't be null in the second clause, since we have a newLen
            if (newLen == double.PositiveInfinity || (newLen == oldLen + 1 && a.CompareTo(f) < 0))
            {
                // we do this to newNode when we have new info, possibly many times
                first[newNode] = a; // ejecting old one if necessary
                length.SetCount(newNode, oldLen + 1); // this may already be the case
                lambda.SetCount(newNode, k + lambda.GetCount(current));
            }
        }
    }
    return lambda;
}
/// <summary>
/// Scores every pattern for the current label from the number of positive, negative and
/// unlabeled phrases it extracted, using the formula selected by <c>patternScoring</c>.
/// </summary>
/// <remarks>
/// All RlogF variants multiply a ratio of counts by log(positive count); they differ only
/// in the denominator of the ratio. YanGarber02 and LinICML03 additionally keep only the
/// patterns whose pos / (pos + neg) passes <c>Counters.RetainAbove</c> with 0.8.
/// </remarks>
/// <returns>A counter from pattern to its score.</returns>
/// <exception cref="System.Exception">If <c>patternScoring</c> is not one of the handled values.</exception>
public override ICounter<E> Score()
{
    ICounter<E> positives = new ClassicCounter<E>();
    ICounter<E> negatives = new ClassicCounter<E>();
    ICounter<E> unlabeled = new ClassicCounter<E>();
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> entry in negPatternsandWords4Label.EntrySet())
    {
        negatives.SetCount(entry.Key, entry.Value.Size());
    }
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> entry in unLabeledPatternsandWords4Label.EntrySet())
    {
        unlabeled.SetCount(entry.Key, entry.Value.Size());
    }
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> entry in patternsandWords4Label.EntrySet())
    {
        positives.SetCount(entry.Key, entry.Value.Size());
    }

    // pos + neg + unlabeled
    ICounter<E> allCounts = Counters.Add(positives, negatives);
    allCounts.AddAll(unlabeled);
    // pos + neg
    ICounter<E> labeledCounts = Counters.Add(positives, negatives);
    // log of the positive counts, shared by every formula below
    ICounter<E> logPositives = new ClassicCounter<E>(positives);
    Counters.LogInPlace(logPositives);

    ICounter<E> weights;
    if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogF))
    {
        weights = Counters.Product(Counters.Division(positives, allCounts), logPositives);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFPosNeg))
    {
        Redwood.Log("extremePatDebug", "computing rlogfposneg");
        weights = Counters.Product(Counters.Division(positives, labeledCounts), logPositives);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFUnlabNeg))
    {
        Redwood.Log("extremePatDebug", "computing rlogfunlabeg");
        weights = Counters.Product(Counters.Division(positives, Counters.Add(negatives, unlabeled)), logPositives);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RlogFNeg))
    {
        Redwood.Log("extremePatDebug", "computing rlogfneg");
        weights = Counters.Product(Counters.Division(positives, negatives), logPositives);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.YanGarber02))
    {
        ICounter<E> precision = Counters.Division(positives, Counters.Add(positives, negatives));
        double precisionThreshold = 0.8;
        Counters.RetainAbove(precision, precisionThreshold);
        ICounter<E> confidence = Counters.Product(Counters.Division(positives, allCounts), logPositives);
        weights = new ClassicCounter<E>();
        foreach (E pattern in precision.KeySet())
        {
            weights.SetCount(pattern, confidence.GetCount(pattern));
        }
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LinICML03))
    {
        ICounter<E> precision = Counters.Division(positives, Counters.Add(positives, negatives));
        double precisionThreshold = 0.8;
        Counters.RetainAbove(precision, precisionThreshold);
        // (pos - neg) / all, times log(pos)
        ICounter<E> posMinusNeg = Counters.Add(positives, Counters.Scale(negatives, -1));
        ICounter<E> confidence = Counters.Product(Counters.Division(posMinusNeg, allCounts), logPositives);
        weights = new ClassicCounter<E>();
        foreach (E pattern in precision.KeySet())
        {
            weights.SetCount(pattern, confidence.GetCount(pattern));
        }
    }
    else
    {
        throw new Exception("not implemented " + patternScoring + " . check spelling!");
    }
    return weights;
}
/// <summary>
/// Scores every pattern for the current label as numerator / denominator, where both are
/// produced by <c>Convert2OneDim</c>; for the two "logP" scorings the ratio is further
/// multiplied by the log of the number of phrases the pattern extracted. Zero entries are
/// removed from the result.
/// </summary>
/// <returns>A counter from pattern to its non-zero score.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.Exception">If <c>patternScoring</c> is not one of the handled values.</exception>
public override ICounter<E> Score()
{
    // Normalized external dictionary weights, only when some exist for this label.
    ICounter<CandidatePhrase> normalizedExternalWeights = null;
    if (constVars.dictOddsWeights.Contains(label))
    {
        normalizedExternalWeights = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
    }
    bool useFreqPhraseExtractedByPat = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio);

    // NOTE(review): both scoring functions are passed as null in every branch, so the
    // branches differ only in the two boolean arguments — confirm this is intended.
    IToDoubleFunction<Pair<E, CandidatePhrase>> numeratorScorer = null;
    IToDoubleFunction<Pair<E, CandidatePhrase>> denominatorScorer = null;

    ICounter<E> numerator = this.Convert2OneDim(label, numeratorScorer, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, null, useFreqPhraseExtractedByPat);
    ICounter<E> denominator;
    if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegUnlabOdds))
    {
        denominator = this.Convert2OneDim(label, denominatorScorer, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, normalizedExternalWeights, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RatioAll))
    {
        denominator = this.Convert2OneDim(label, denominatorScorer, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, normalizedExternalWeights, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegOdds))
    {
        denominator = this.Convert2OneDim(label, denominatorScorer, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, normalizedExternalWeights, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat)
        || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
        || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg)
        || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP))
    {
        denominator = this.Convert2OneDim(label, denominatorScorer, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, true, normalizedExternalWeights, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio))
    {
        denominator = this.Convert2OneDim(label, denominatorScorer, allCandidatePhrases, patternsandWords4Label, true, false, normalizedExternalWeights, useFreqPhraseExtractedByPat);
    }
    else
    {
        throw new Exception("Cannot understand patterns scoring");
    }

    ICounter<E> patternWeights = Counters.DivisionNonNaN(numerator, denominator);

    // Multiplying by logP
    bool multiplyByLogP = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
        || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP);
    if (multiplyByLogP)
    {
        ICounter<E> logPositiveCounts = new ClassicCounter<E>();
        foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> entry in patternsandWords4Label.EntrySet())
        {
            logPositiveCounts.SetCount(entry.Key, Math.Log(entry.Value.Size()));
        }
        Counters.MultiplyInPlace(patternWeights, logPositiveCounts);
    }
    Counters.RetainNonZeros(patternWeights);
    return patternWeights;
}
/// <summary>
/// Applies the patterns learned this iteration (or only gathers counts when
/// <c>constVars.doNotApplyPatterns</c> is set), scores the extracted phrases and returns the
/// phrases selected for <paramref name="label"/>.
/// </summary>
/// <remarks>
/// With Weightednorm word scoring the phrases are scored by <c>phraseScorer</c> and picked by
/// <c>ChooseTopWords</c>; optionally the justification is appended to
/// <c>outDir/identifier/label/words.json</c>. With Bpb word scoring a single phrase is
/// returned: the one whose best pattern has the highest weight, ties broken by the count
/// of the phrase with that pattern.
/// </remarks>
/// <param name="label">The label being learned.</param>
/// <param name="patternsForEachToken">Passed through to the statistics-only path.</param>
/// <param name="patternsLearnedThisIter">Patterns (with weights) selected in this iteration.</param>
/// <param name="allSelectedPatterns">All patterns selected so far; passed to the phrase scorer.</param>
/// <param name="alreadyIdentifiedWords">Phrases that must not be selected again.</param>
/// <param name="matchedTokensByPat">Filled by pattern application.</param>
/// <param name="scoreForAllWordsThisIteration">Cleared and refilled with this iteration's phrase scores (Weightednorm only).</param>
/// <param name="terms">Accumulates phrase-by-pattern counts that are eligible for scoring.</param>
/// <param name="wordsPatExtracted">Phrase-by-pattern counts produced by applying the patterns.</param>
/// <param name="patternsAndWords4Label">Pattern-by-phrase counts used to explain why a phrase was chosen.</param>
/// <param name="identifier">Sub-directory name used for the justification output.</param>
/// <param name="ignoreWords">Extra phrases to ignore; may be null.</param>
/// <param name="computeProcDataFreq">Whether to (re)compute <c>Data.processedDataFreq</c> from <c>Data.rawFreq</c>.</param>
/// <returns>The selected phrases with their scores; empty if Bpb scoring finds no candidate.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.Exception">If the frequency normalization or the word scoring is not recognized.</exception>
private ICounter<CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICounter<CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection<CandidatePhrase> ignoreWords, bool computeProcDataFreq)
{
    ICollection<CandidatePhrase> alreadyLabeledWords = new HashSet<CandidatePhrase>();
    if (constVars.doNotApplyPatterns)
    {
        // if want to get the stats by the lossy way of just counting without
        // applying the patterns
        ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
        while (sentsIter.MoveNext())
        {
            Pair<IDictionary<string, DataInstance>, File> sentsf = sentsIter.Current;
            this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
        }
    }
    else if (patternsLearnedThisIter.Size() > 0)
    {
        this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
    }

    if (computeProcDataFreq)
    {
        if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
        {
            Redwood.Log(Redwood.Dbg, "computing processed freq");
            foreach (KeyValuePair<CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
            {
                double @in = fq.Value;
                if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                {
                    @in = Math.Sqrt(@in);
                }
                else if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                {
                    @in = 1 + Math.Log(@in);
                }
                else
                {
                    throw new Exception("can't understand the normalization");
                }
                System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                Data.processedDataFreq.SetCount(fq.Key, @in);
            }
        }
        else
        {
            Data.processedDataFreq = Data.rawFreq;
        }
    }

    if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
    {
        // Only phrases that are not in another semantic class (by surface form or lemma)
        // and were not labeled while applying the patterns are eligible.
        foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
        {
            if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
            {
                terms.AddAll(en, wordsPatExtracted.GetCounter(en));
            }
        }
        RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
        ICounter<CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
        System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
        ICollection<CandidatePhrase> ignoreWordsAll;
        if (ignoreWords != null && !ignoreWords.IsEmpty())
        {
            ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
        }
        else
        {
            ignoreWordsAll = new HashSet<CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
        }
        Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
        Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
        System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
        ICounter<CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
        phraseScorer.PrintReasonForChoosing(finalwords);
        scoreForAllWordsThisIteration.Clear();
        Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));

        if (constVars.goldEntities != null)
        {
            IDictionary<string, bool> goldEntities4Label = constVars.goldEntities[label];
            if (goldEntities4Label != null)
            {
                StringBuilder s = new StringBuilder();
                // NOTE(review): ForEach is handed a null action, so nothing is ever appended
                // to s (and the call may throw, depending on the Stream shim) — the callback
                // that filled s appears to be missing; confirm and restore it.
                finalwords.KeySet().Stream().ForEach(null);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
            }
            else
            {
                Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
            }
        }

        if (constVars.outDir != null && !constVars.outDir.IsEmpty())
        {
            string outputdir = constVars.outDir + "/" + identifier + "/" + label;
            IOUtils.EnsureDir(new File(outputdir));
            // For each selected word, count the other words extracted by the same patterns.
            TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<CandidatePhrase, CandidatePhrase>();
            foreach (CandidatePhrase word in finalwords.KeySet())
            {
                foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                {
                    foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                    {
                        reasonForWords.IncrementCount(word, w2);
                    }
                }
            }
            Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
            string filename = outputdir + "/words.json";
            // the json object is an array corresponding to each iteration - of list
            // of objects,
            // each of which is a bean of entity and reasons
            IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
            if (writtenInJustification.Contains(label) && writtenInJustification[label])
            {
                // Carry over what earlier iterations wrote; close the reader even if
                // reading fails.
                IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                try
                {
                    IJsonArray objarr = jsonReader.ReadArray();
                    foreach (IJsonValue o in objarr)
                    {
                        obj.Add(o);
                    }
                }
                finally
                {
                    jsonReader.Close();
                }
            }
            IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
            foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
            {
                IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder();
                foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                {
                    l.Add(w2.GetPhrase());
                }
                IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                foreach (E p in wordsPatExtracted.GetCounter(w))
                {
                    pats.Add(p.ToStringSimple());
                }
                objinner.Add("reasonwords", l);
                objinner.Add("patterns", pats);
                objinner.Add("score", finalwords.GetCount(w));
                objinner.Add("entity", w.GetPhrase());
                objThisIter.Add(objinner.Build());
            }
            obj.Add(objThisIter);
            IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
            writtenInJustification[label] = true;
        }

        if (constVars.justify)
        {
            Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
            foreach (CandidatePhrase word in finalwords.KeySet())
            {
                Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
            }
        }
        return finalwords;
    }
    else if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
    {
        Counters.AddInPlace(terms, wordsPatExtracted);
        // For each phrase: the weight of its best pattern, and which pattern that is.
        ICounter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<CandidatePhrase>();
        IDictionary<CandidatePhrase, E> wordMaxPat = new Dictionary<CandidatePhrase, E>();
        foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
        {
            ICounter<E> weights = new ClassicCounter<E>();
            foreach (E k in en.Value.KeySet())
            {
                weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
            }
            maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
            wordMaxPat[en.Key] = Counters.Argmax(weights);
        }
        Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
        double maxvalue = Counters.Max(maxPatWeightTerms);
        // All phrases tied (within 1e-10) for the best pattern weight.
        ICollection<CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
        CandidatePhrase bestw = null;
        if (words.Count > 1)
        {
            // Break the tie by how often the phrase occurred with its best pattern.
            double max = double.NegativeInfinity;
            foreach (CandidatePhrase w in words)
            {
                if (terms.GetCount(w, wordMaxPat[w]) > max)
                {
                    max = terms.GetCount(w, wordMaxPat[w]);
                    bestw = w;
                }
            }
        }
        else if (words.Count == 1)
        {
            // Take the only element. Reading Current from a fresh enumerator without
            // calling MoveNext() does not yield the first element, so iterate instead.
            foreach (CandidatePhrase only in words)
            {
                bestw = only;
                break;
            }
        }
        else
        {
            return new ClassicCounter<CandidatePhrase>();
        }
        Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
        return Counters.AsCounter(Arrays.AsList(bestw));
    }
    else
    {
        throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
    }
}
/// <summary>
/// Populates the static lookup tables of this class: the set of quantifiable
/// entity tags, the numeric value of Chinese number words and unit characters,
/// the currency-word to currency-symbol maps, the year and month/day modifier
/// offsets, and the full-width to half-width digit map.
/// </summary>
static ChineseQuantifiableEntityNormalizer()
{
    // NOTE(review): the comments in this first group carry no code of their own here;
    // they look like they were detached from the declarations they describe -- confirm
    // and relocate.
    //   Entity types that are quantifiable
    //   used by money
    //   Patterns we need
    //   TODO (yuhao): here we are not considering 1) negative numbers, 2) Chinese traditional characters
    //   This is the all-literal-number-characters sequence, excluding unit characters like 十 or 万
    //   The decimal part of a float number should be exactly literal number sequence without units
    //   Used by quantity modifiers
    //   All the tags we need

    // static initialization of useful properties
    quantifiable = Generics.NewHashSet();
    quantifiable.Add(NumberTag);
    quantifiable.Add(DateTag);
    quantifiable.Add(TimeTag);
    quantifiable.Add(MoneyTag);
    quantifiable.Add(PercentTag);
    quantifiable.Add(OrdinalTag);

    // Unit characters and the power of ten each one stands for.
    quantityUnitToValues = new ClassicCounter<string>();
    quantityUnitToValues.SetCount("十", 10.0);
    quantityUnitToValues.SetCount("百", 100.0);
    quantityUnitToValues.SetCount("千", 1000.0);
    quantityUnitToValues.SetCount("万", 10000.0);
    quantityUnitToValues.SetCount("亿", 100000000.0);

    // Literal number characters and their values.
    wordsToValues = new ClassicCounter<string>();
    wordsToValues.SetCount("零", 0.0);
    wordsToValues.SetCount("〇", 0.0);
    wordsToValues.SetCount("一", 1.0);
    wordsToValues.SetCount("二", 2.0);
    wordsToValues.SetCount("两", 2.0);
    wordsToValues.SetCount("三", 3.0);
    wordsToValues.SetCount("四", 4.0);
    wordsToValues.SetCount("五", 5.0);
    wordsToValues.SetCount("六", 6.0);
    wordsToValues.SetCount("七", 7.0);
    wordsToValues.SetCount("八", 8.0);
    wordsToValues.SetCount("九", 9.0);
    // all units are also quantifiable individually
    wordsToValues.AddAll(quantityUnitToValues);

    // Multi-character currency words mapped to the symbol used for them.
    multiCharCurrencyWords = Generics.NewHashMap();
    multiCharCurrencyWords["美元"] = '$';
    multiCharCurrencyWords["美分"] = '$';
    multiCharCurrencyWords["英镑"] = '£';
    multiCharCurrencyWords["先令"] = '£';
    multiCharCurrencyWords["便士"] = '£';
    multiCharCurrencyWords["欧元"] = '€';
    multiCharCurrencyWords["日元"] = '¥';
    multiCharCurrencyWords["韩元"] = '₩';

    // Single-character currency words.
    oneCharCurrencyWords = Generics.NewHashMap();
    oneCharCurrencyWords["刀"] = '$';
    oneCharCurrencyWords["镑"] = '£';
    // We follow the tradition in English to use 元 instead of ¥ for RMB
    oneCharCurrencyWords["元"] = '元';
    // For all other currency, we use default currency symbol $

    // Year modifier characters mapped to a signed integer
    // (presumably an offset in years from the reference year -- confirm against the users of this map).
    yearModifiers = Generics.NewHashMap();
    yearModifiers["前"] = -2;
    yearModifiers["去"] = -1;
    yearModifiers["上"] = -1;
    yearModifiers["今"] = 0;
    yearModifiers["同"] = 0;
    yearModifiers["此"] = 0;
    yearModifiers["该"] = 0;
    yearModifiers["本"] = 0;
    yearModifiers["明"] = 1;
    yearModifiers["来"] = 1;
    yearModifiers["下"] = 1;
    yearModifiers["后"] = 2;

    // Month/day modifier characters mapped to a signed integer
    // (presumably an offset in months or days -- confirm against the users of this map).
    monthDayModifiers = Generics.NewHashMap();
    monthDayModifiers["昨"] = -1;
    monthDayModifiers["上"] = -1;
    monthDayModifiers["今"] = 0;
    monthDayModifiers["同"] = 0;
    monthDayModifiers["此"] = 0;
    monthDayModifiers["该"] = 0;
    monthDayModifiers["本"] = 0;
    monthDayModifiers["来"] = 1;
    monthDayModifiers["明"] = 1;
    monthDayModifiers["下"] = 1;

    // Full-width digits (U+FF10..U+FF19) mapped to their ASCII counterparts.
    // The keys are written as \u escapes on purpose: as literal characters they are
    // easily folded to ASCII by Unicode normalization, which silently turns this
    // table into a useless identity map ("1" -> "1").
    fullDigitToHalfDigit = Generics.NewHashMap();
    fullDigitToHalfDigit["\uFF11"] = "1";
    fullDigitToHalfDigit["\uFF12"] = "2";
    fullDigitToHalfDigit["\uFF13"] = "3";
    fullDigitToHalfDigit["\uFF14"] = "4";
    fullDigitToHalfDigit["\uFF15"] = "5";
    fullDigitToHalfDigit["\uFF16"] = "6";
    fullDigitToHalfDigit["\uFF17"] = "7";
    fullDigitToHalfDigit["\uFF18"] = "8";
    fullDigitToHalfDigit["\uFF19"] = "9";
    fullDigitToHalfDigit["\uFF10"] = "0";
}
/// <summary>
/// Collapses the (pattern, phrase) counter <paramref name="positivePatternsAndWords"/>
/// into a single score per pattern by summing one score for every phrase the
/// pattern is paired with.
/// </summary>
/// <remarks>
/// The per-phrase score starts at 1. When <paramref name="scorePhrasesInPatSelection"/>
/// is set and <c>patternScoring</c> is <c>PhEvalInPat</c>/<c>PhEvalInPatLogP</c>, it is the
/// mean of the measures enabled through the <c>constVars.usePatternEval*</c> flags (cached
/// per phrase for the duration of this call); when <c>patternScoring</c> is
/// <c>Logreg</c>/<c>LOGREGlogP</c>, it is one minus the score returned by a
/// <c>ScorePhrasesLearnFeatWt</c> phrase scorer.
/// </remarks>
/// <param name="label">Label handed to the edit-distance lookups, to <c>constVars.distSimWeights</c> and to the classifier-based scorer.</param>
/// <param name="scoringFunction">Applied to each (pattern, phrase) pair; its result multiplies the phrase score when <paramref name="useFreqPhraseExtractedByPat"/> is true.</param>
/// <param name="allCandidatePhrases">Phrases for which the per-phrase measures (or classifier scores) are precomputed.</param>
/// <param name="positivePatternsAndWords">Patterns and the phrases paired with them; every entry contributes to its pattern's total.</param>
/// <param name="sqrtPatScore">Not read by this method: the square-root decision is taken from <c>constVars.sqrtPatScore</c>. Kept for interface compatibility.</param>
/// <param name="scorePhrasesInPatSelection">Enables phrase-based scoring; when false every phrase score stays at 1 (before the optional frequency factor).</param>
/// <param name="dictOddsWordWeights">Per-phrase weights used for the semantic-odds measure.</param>
/// <param name="useFreqPhraseExtractedByPat">Whether to multiply each phrase score by <paramref name="scoringFunction"/>.</param>
/// <returns>A counter holding the accumulated score of every pattern.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual ICounter<E> Convert2OneDim(string label, IToDoubleFunction<Pair<E, CandidatePhrase>> scoringFunction, ICollection<CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter<E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter<CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
{
    // if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
    // Data.loadGoogleNGrams();
    // }
    ICounter<E> patterns = new ClassicCounter<E>();
    ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromOtherSemanticBinaryScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter<CandidatePhrase>();
    double externalWtsDefault = 0.5;
    ICounter<string> classifierScores = null;

    // The scoring mode does not change during this call, so decide it once
    // instead of re-evaluating it for every (pattern, phrase) pair.
    bool usePhraseEvalMeasures = (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection;
    bool useClassifierScores = !usePhraseEvalMeasures && (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection;

    if (usePhraseEvalMeasures)
    {
        // Precompute the raw per-phrase measures that are enabled.
        foreach (CandidatePhrase gc in allCandidatePhrases)
        {
            string g = gc.GetPhrase();
            if (constVars.usePatternEvalEditDistOther)
            {
                editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
            }
            if (constVars.usePatternEvalEditDistSame)
            {
                editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
            }
            if (constVars.usePatternEvalGoogleNgram)
            {
                googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
            }
            if (constVars.usePatternEvalDomainNgram)
            {
                // calculate domain-ngram wts
                if (Data.domainNGramRawFreq.ContainsKey(g))
                {
                    System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                    domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                }
            }
            if (constVars.usePatternEvalWordClass)
            {
                // Look the phrase up as-is, then lower-cased. An int can never be null,
                // so the former "num == null" tests were always false and a phrase
                // without a cluster raised a lookup failure instead of receiving the
                // default weight; TryGetValue restores the intended fallback chain.
                int num;
                bool hasCluster = constVars.GetWordClassClusters().TryGetValue(g, out num) || constVars.GetWordClassClusters().TryGetValue(g.ToLower(), out num);
                if (hasCluster && constVars.distSimWeights[label].ContainsKey(num))
                {
                    externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                }
                else
                {
                    externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                }
            }
        }
        if (constVars.usePatternEvalGoogleNgram)
        {
            googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalDomainNgram)
        {
            domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalWordClass)
        {
            externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
        }
    }
    else if (useClassifierScores)
    {
        // Score all candidate phrases with a ScorePhrasesLearnFeatWt phrase scorer
        // configured from a copy of the current properties.
        Properties props2 = new Properties();
        props2.PutAll(props);
        props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
        ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
        System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
        ArgumentParser.FillOptions(typeof(Data), props2);
        classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
    }

    // Phrase scores computed from the measures are reused across patterns within this call.
    ICounter<CandidatePhrase> cachedScoresForThisIter = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in positivePatternsAndWords.EntrySet())
    {
        foreach (KeyValuePair<CandidatePhrase, double> en2 in en.Value.EntrySet())
        {
            CandidatePhrase word = en2.Key;
            ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
            double score = 1;
            if (usePhraseEvalMeasures)
            {
                if (cachedScoresForThisIter.ContainsKey(word))
                {
                    score = cachedScoresForThisIter.GetCount(word);
                }
                else
                {
                    if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                    {
                        // Phrases of other semantic classes and common English words keep score 1.
                        score = 1;
                    }
                    else
                    {
                        if (constVars.usePatternEvalSemanticOdds)
                        {
                            double semanticClassOdds = 1;
                            if (dictOddsWordWeights.ContainsKey(word))
                            {
                                semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                        }
                        if (constVars.usePatternEvalGoogleNgram)
                        {
                            double gscore = 0;
                            if (googleNgramNormScores.ContainsKey(word))
                            {
                                gscore = 1 - googleNgramNormScores.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                        }
                        if (constVars.usePatternEvalDomainNgram)
                        {
                            double domainscore;
                            if (domainNgramNormScores.ContainsKey(word))
                            {
                                domainscore = 1 - domainNgramNormScores.GetCount(word);
                            }
                            else
                            {
                                domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                        }
                        if (constVars.usePatternEvalWordClass)
                        {
                            double externalFeatureWt = externalWtsDefault;
                            if (externalFeatWtsNormalized.ContainsKey(word))
                            {
                                externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                        }
                        if (constVars.usePatternEvalEditDistOther)
                        {
                            System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                        }
                        if (constVars.usePatternEvalEditDistSame)
                        {
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                        }
                        // taking average
                        score = Counters.Mean(scoreslist);
                        phInPatScores.SetCounter(word, scoreslist);
                    }
                    cachedScoresForThisIter.SetCount(word, score);
                }
            }
            else if (useClassifierScores)
            {
                score = 1 - classifierScores.GetCount(word);
            }
            // score = 1 - scorePhrases.scoreUsingClassifer(classifier,
            // e.getKey(), label, true, null, null, dictOddsWordWeights);
            // throw new RuntimeException("not implemented yet");
            if (useFreqPhraseExtractedByPat)
            {
                score = score * scoringFunction.ApplyAsDouble(new Pair<E, CandidatePhrase>(en.Key, word));
            }
            // NOTE(review): this reads constVars.sqrtPatScore, not the sqrtPatScore
            // parameter; left unchanged so existing callers see the same behaviour.
            if (constVars.sqrtPatScore)
            {
                patterns.IncrementCount(en.Key, Math.Sqrt(score));
            }
            else
            {
                patterns.IncrementCount(en.Key, score);
            }
        }
    }
    return patterns;
}