/// <summary>Converts a datum's features from raw counts to L1-normalized TF-IDF features.</summary>
/// <param name="datum">The datum whose features should be converted.</param>
/// <param name="featureDocCounts">A counter of the document count for each feature.</param>
/// <returns>An RVFDatum with L1-normalized TF-IDF feature values.</returns>
public virtual RVFDatum<L, F> GetL1NormalizedTFIDFDatum(IDatum<L, F> datum, ICounter<F> featureDocCounts) {
    ICounter<F> tfidfFeatures = new ClassicCounter<F>();
    // term frequency: count each feature occurrence that has a known document count
    foreach (F feature in datum.AsFeatures()) {
        if (featureDocCounts.ContainsKey(feature)) {
            tfidfFeatures.IncrementCount(feature, 1.0);
        }
    }
    double l1norm = 0;
    // weight each term frequency by its smoothed inverse document frequency
    foreach (F feature in tfidfFeatures.KeySet()) {
        double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature) + 0.5));
        double tf = tfidfFeatures.GetCount(feature);
        tfidfFeatures.SetCount(feature, tf * idf);
        l1norm += tf * idf;
    }
    // normalize so the feature values sum to 1
    foreach (F feature in tfidfFeatures.KeySet()) {
        double tfidf = tfidfFeatures.GetCount(feature);
        tfidfFeatures.SetCount(feature, tfidf / l1norm);
    }
    return new RVFDatum<L, F>(tfidfFeatures, datum.Label());
}
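// Illustrative sketch (not part of the library): the same TF-IDF weighting and L1 normalization
// as above, written over plain dictionaries so the arithmetic is easy to follow. "numDocs" stands
// in for this.Size(); all names here are hypothetical.
public static Dictionary<string, double> L1NormalizedTfIdfSketch(IList<string> docFeatures, IDictionary<string, double> docCounts, int numDocs) {
    var weights = new Dictionary<string, double>();
    foreach (string f in docFeatures) {
        if (docCounts.ContainsKey(f)) {
            double tf;
            weights.TryGetValue(f, out tf);
            weights[f] = tf + 1.0;  // term frequency
        }
    }
    double l1 = 0.0;
    foreach (string f in new List<string>(weights.Keys)) {
        double idf = Math.Log((numDocs + 1.0) / (docCounts[f] + 0.5));  // smoothed inverse document frequency
        weights[f] *= idf;
        l1 += weights[f];
    }
    foreach (string f in new List<string>(weights.Keys)) {
        weights[f] /= l1;  // values now sum to 1 (assuming all idf terms are positive)
    }
    return weights;
}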
public override Pair <UnaryGrammar, BinaryGrammar> FormResult() { stateIndex.AddToIndex(LexiconConstants.BoundaryTag); BinaryGrammar bg = new BinaryGrammar(stateIndex); UnaryGrammar ug = new UnaryGrammar(stateIndex); // add unaries foreach (UnaryRule ur in unaryRules) { ur.score = (float)Math.Log(unaryRuleCounter.GetCount(ur) / symbolCounter.GetCount(stateIndex.Get(ur.parent))); if (op.trainOptions.CompactGrammar() >= 4) { ur.score = (float)unaryRuleCounter.GetCount(ur); } ug.AddRule(ur); } // add binaries foreach (BinaryRule br in binaryRules) { br.score = (float)Math.Log((binaryRuleCounter.GetCount(br) - op.trainOptions.ruleDiscount) / symbolCounter.GetCount(stateIndex.Get(br.parent))); if (op.trainOptions.CompactGrammar() >= 4) { br.score = (float)binaryRuleCounter.GetCount(br); } bg.AddRule(br); } return(new Pair <UnaryGrammar, BinaryGrammar>(ug, bg)); }
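// Illustrative sketch (not part of the parser): the relative-frequency scores assigned above.
// Unary rules get log(count(rule) / count(parent)); binary rules first subtract a small discount
// from the rule count. Names here are hypothetical.
public static double MleUnaryLogScore(double ruleCount, double parentCount) {
    return Math.Log(ruleCount / parentCount);  // e.g. a rule seen 30 times under a parent seen 100 times scores log(0.3)
}
public static double MleBinaryLogScore(double ruleCount, double parentCount, double ruleDiscount) {
    return Math.Log((ruleCount - ruleDiscount) / parentCount);  // discounted relative frequency
}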
/// <summary>Checks whether a word is in the lexicon.</summary> /// <remarks> /// Checks whether a word is in the lexicon. This version works even while /// compiling lexicon with current counters (rather than using the compiled /// rulesWithWord array). /// TODO: The previous version would insert rules into the /// wordNumberer. Is that the desired behavior? Why not test in /// some way that doesn't affect the index? For example, start by /// testing wordIndex.contains(word). /// </remarks> /// <param name="word">The word as a String</param> /// <returns>Whether the word is in the lexicon</returns> public virtual bool IsKnown(string word) { if (!wordIndex.Contains(word)) { return(false); } IntTaggedWord iW = new IntTaggedWord(wordIndex.IndexOf(word), nullTag); return(seenCounter.GetCount(iW) > 0.0); }
/// <summary>Takes time linear in number of arcs.</summary> public virtual TransducerGraph PushLambdas(TransducerGraph graph, ClassicCounter lambda) { TransducerGraph result = null; result = graph.Clone(); // arcs have been copied too so we don't mess up graph ICollection <TransducerGraph.Arc> arcs = result.GetArcs(); foreach (TransducerGraph.Arc arc in arcs) { double sourceLambda = lambda.GetCount(arc.GetSourceNode()); double targetLambda = lambda.GetCount(arc.GetTargetNode()); double oldOutput = ((double)arc.GetOutput()); double newOutput = oldOutput + targetLambda - sourceLambda; arc.SetOutput(newOutput); } // do initialOutput double startLambda = lambda.GetCount(result.GetStartNode()); if (startLambda != 0.0) { // add it back to the outbound arcs from start (instead of adding it to the initialOutput) ICollection <TransducerGraph.Arc> startArcs = result.GetArcsBySource(result.GetStartNode()); foreach (TransducerGraph.Arc arc_1 in startArcs) { double oldOutput = ((double)arc_1.GetOutput()); double newOutput = oldOutput + startLambda; arc_1.SetOutput(newOutput); } } // do finalOutput foreach (object o in result.GetEndNodes()) { double endLambda = lambda.GetCount(o); if (endLambda != 0.0) { // subtract it from the inbound arcs to end (instead of subtracting it from the finalOutput) ICollection <TransducerGraph.Arc> endArcs = result.GetArcsByTarget(o); foreach (TransducerGraph.Arc arc_1 in endArcs) { double oldOutput = ((double)arc_1.GetOutput()); double newOutput = oldOutput - endLambda; arc_1.SetOutput(newOutput); } } } return(result); }
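// Illustrative sketch (not part of TransducerGraph): the reweighting applied to each arc above.
// Pushing a per-node potential ("lambda") through a weighted graph replaces every arc weight
// w(u -> v) with w + lambda(v) - lambda(u); interior lambdas telescope along any path, and the
// method above compensates at the start and end nodes so every complete path keeps its original
// total. Names here are hypothetical.
public static double PushLambdaThroughArc(double arcOutput, double sourceLambda, double targetLambda) {
    return arcOutput + targetLambda - sourceLambda;
}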
/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> private void ReadObject(ObjectInputStream stream) { stream.DefaultReadObject(); // log.info("Before decompression:"); // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount()); // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount()); ClassicCounter <IntDependency> compressedArgC = argCounter; argCounter = new ClassicCounter <IntDependency>(); ClassicCounter <IntDependency> compressedStopC = stopCounter; stopCounter = new ClassicCounter <IntDependency>(); foreach (IntDependency d in compressedArgC.KeySet()) { double count = compressedArgC.GetCount(d); ExpandArg(d, d.distance, count); } foreach (IntDependency d_1 in compressedStopC.KeySet()) { double count = compressedStopC.GetCount(d_1); ExpandStop(d_1, d_1.distance, count, false); } // log.info("After decompression:"); // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount()); // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount()); expandDependencyMap = null; }
public override IUnknownWordModel FinishTraining() { if (useGT) { unknownGTTrainer.FinishTraining(); } foreach (KeyValuePair <ILabel, ClassicCounter <string> > entry in c) { /* outer iteration is over tags */ ILabel key = entry.Key; ClassicCounter <string> wc = entry.Value; // counts for words given a tag if (!tagHash.Contains(key)) { tagHash[key] = new ClassicCounter <string>(); } /* the UNKNOWN sequence is assumed to be seen once in each tag */ // This is sort of broken, but you can regard it as a Dirichlet prior. tc.IncrementCount(key); wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0); /* inner iteration is over words */ foreach (string end in wc.KeySet()) { double prob = Math.Log((wc.GetCount(end)) / (tc.GetCount(key))); // p(sig|tag) tagHash[key].SetCount(end, prob); } } //if (Test.verbose) //EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding); return(model); }
public override void Train(TaggedWord tw, int loc, double weight) { if (useGT) { unknownGTTrainer.Train(tw, weight); } // scan data string word = tw.Word(); string subString = model.GetSignature(word, loc); ILabel tag = new Tag(tw.Tag()); if (!c.Contains(tag)) { c[tag] = new ClassicCounter <string>(); } c[tag].IncrementCount(subString, weight); tc.IncrementCount(tag, weight); seenEnd.Add(subString); string tagStr = tw.Tag(); IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex); seenCounter.IncrementCount(iW, weight); if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.GetCount(iW) < 2) { IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex); unSeenCounter.IncrementCount(iT, weight); unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight); } } }
public virtual void FinishTraining() {
    // testing: get some stats here
    log.Info("Total tokens: " + tokens);
    log.Info("Total WordTag types: " + wtCount.KeySet().Count);
    log.Info("Total tag types: " + tagCount.KeySet().Count);
    log.Info("Total word types: " + seenWords.Count);
    /* find # of once-seen words for each tag */
    foreach (Pair<string, string> wt in wtCount.KeySet()) {
        if (wtCount.GetCount(wt) == 1) {
            r1.IncrementCount(wt.Second());
        }
    }
    /* find # of unseen words for each tag */
    foreach (string tag in tagCount.KeySet()) {
        foreach (string word in seenWords) {
            Pair<string, string> wt = new Pair<string, string>(word, tag);
            if (!wtCount.KeySet().Contains(wt)) {
                r0.IncrementCount(tag);
            }
        }
    }
    /* set unseen word probability for each tag */
    foreach (string tag in tagCount.KeySet()) {
        float logprob = (float)Math.Log(r1.GetCount(tag) / (tagCount.GetCount(tag) * r0.GetCount(tag)));
        unknownGT[tag] = logprob;
    }
}
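// Illustrative sketch (not part of the lexicon): the Good-Turing style estimate computed above.
// The probability mass given to an unseen word under a tag is approximated from the number of
// once-seen words with that tag (r1), spread over the words never seen with that tag (r0):
//   P(unseen word | tag) ~= r1(tag) / (count(tag) * r0(tag)), stored as a log probability.
// Names here are hypothetical.
public static float UnseenWordLogProbSketch(double onceSeenWithTag, double tagCount, double unseenWithTag) {
    return (float)Math.Log(onceSeenWithTag / (tagCount * unseenWithTag));
}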
public override IUnknownWordModel FinishTraining() { // Map<String,Float> unknownGT = null; if (useGT) { unknownGTTrainer.FinishTraining(); } // unknownGT = unknownGTTrainer.unknownGT; foreach (ILabel tagLab in c.Keys) { // outer iteration is over tags as Labels ClassicCounter <string> wc = c[tagLab]; // counts for words given a tag if (!tagHash.Contains(tagLab)) { tagHash[tagLab] = new ClassicCounter <string>(); } // the UNKNOWN first character is assumed to be seen once in // each tag // this is really sort of broken! (why??) tc.IncrementCount(tagLab); wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0); // inner iteration is over words as strings foreach (string first in wc.KeySet()) { double prob = Math.Log(((wc.GetCount(first))) / tc.GetCount(tagLab)); tagHash[tagLab].SetCount(first, prob); } } //if (Test.verbose) //EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding); return(model); }
/// <summary>Trains this lexicon on the Collection of trees.</summary> public override void Train(TaggedWord tw, int loc, double weight) { IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex); IntTaggedWord iT = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, iTW.tag); IntTaggedWord iW = new IntTaggedWord(iTW.word, UnknownWordModelTrainerConstants.nullTag); seenCounter.IncrementCount(iW, weight); IntTaggedWord i = UnknownWordModelTrainerConstants.NullItw; if (treesRead > indexToStartUnkCounting) { // start doing this once some way through trees; // treesRead is 1 based counting if (seenCounter.GetCount(iW) < 2) { // it's an entirely unknown word int s = model.GetSignatureIndex(iTW.word, loc, wordIndex.Get(iTW.word)); IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag); IntTaggedWord iS = new IntTaggedWord(s, UnknownWordModelTrainerConstants.nullTag); unSeenCounter.IncrementCount(iTS, weight); unSeenCounter.IncrementCount(iT, weight); unSeenCounter.IncrementCount(iS, weight); unSeenCounter.IncrementCount(i, weight); } } }
/// <exception cref="System.IO.IOException"/> private void WriteObject(ObjectOutputStream stream) { // log.info("\nBefore compression:"); // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount()); // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount()); ClassicCounter <IntDependency> fullArgCounter = argCounter; argCounter = new ClassicCounter <IntDependency>(); foreach (IntDependency dependency in fullArgCounter.KeySet()) { if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1) { argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency)); } } ClassicCounter <IntDependency> fullStopCounter = stopCounter; stopCounter = new ClassicCounter <IntDependency>(); foreach (IntDependency dependency_1 in fullStopCounter.KeySet()) { if (dependency_1.head.word != -1) { stopCounter.IncrementCount(dependency_1, fullStopCounter.GetCount(dependency_1)); } } // log.info("After compression:"); // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount()); // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount()); stream.DefaultWriteObject(); argCounter = fullArgCounter; stopCounter = fullStopCounter; }
/// <summary>Checks whether a unit character occurs in the literal string.</summary>
/// <remarks>
/// Checks whether a unit character occurs in the literal string. If so, the string is parsed
/// compositionally around that unit; otherwise null is returned.
/// </remarks>
/// <param name="s">The literal number string to parse.</param>
/// <param name="unit">The unit character to split on (e.g. 十 or 百).</param>
/// <returns>The parsed value, or null if the unit is unknown, absent, or the parts fail to parse.</returns>
private static double? CompositeAtUnitIfExists(string s, string unit) {
    // invalid unit
    if (!quantityUnitToValues.ContainsKey(unit)) {
        return null;
    }
    int idx = s.IndexOf(unit);
    if (idx != -1) {
        double? first = 1.0;
        // Here we need special handling for 十 and 百 when they occur as the first char:
        // in Chinese 十二 is very common, and 百二十 is sometimes valid as well.
        if (("十".Equals(unit) || "百".Equals(unit)) && idx == 0) {
            // keep the implicit leading value of 1
        } else {
            // otherwise try to parse the value before the unit
            first = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, 0, idx));
        }
        double? second = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, idx + 1));
        if (first != null && second != null) {
            return first.Value * quantityUnitToValues.GetCount(unit) + second.Value;
        }
    }
    // return null if the unit is not present or parsing fails
    return null;
}
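// Illustrative sketch (not part of the normalizer): the compositional step used above, on a tiny
// hard-coded unit table. A literal like 三百二十 splits at 百 into "三" and "二十", and its value is
// value(三) * 100 + value(二十) = 320. The table and helper below are hypothetical.
private static readonly Dictionary<string, double> UnitValuesSketch = new Dictionary<string, double> { { "十", 10.0 }, { "百", 100.0 }, { "千", 1000.0 } };
public static double ComposeAtUnitSketch(double? beforeUnit, string unit, double afterUnit) {
    // A missing part before 十 or 百 defaults to 1, so 十二 parses as 1 * 10 + 2 = 12.
    double first = beforeUnit ?? 1.0;
    return first * UnitValuesSketch[unit] + afterUnit;
}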
protected internal override void Calculate(double[] x) { classifier.SetWeights(To2D(x)); if (derivative == null) { derivative = new double[x.Length]; } else { Arrays.Fill(derivative, 0.0); } ICounter <Triple <int, int, int> > feature2classPairDerivatives = new ClassicCounter <Triple <int, int, int> >(); value = 0.0; for (int n = 0; n < geFeatures.Count; n++) { //F feature = geFeatures.get(n); double[] modelDist = new double[numClasses]; Arrays.Fill(modelDist, 0); //go over the unlabeled active data to compute expectations IList <int> activeData = geFeature2DatumList[n]; foreach (int activeDatum in activeData) { IDatum <L, F> datum = unlabeledDataList[activeDatum]; double[] probs = GetModelProbs(datum); for (int c = 0; c < numClasses; c++) { modelDist[c] += probs[c]; } UpdateDerivative(datum, probs, feature2classPairDerivatives); } //computes p(y_d)*(1-p(y_d))*f_d for all active features. //now compute the value (KL-divergence) and the final value of the derivative. if (activeData.Count > 0) { for (int c = 0; c < numClasses; c++) { modelDist[c] /= activeData.Count; } SmoothDistribution(modelDist); for (int c_1 = 0; c_1 < numClasses; c_1++) { value += -geFeature2EmpiricalDist[n][c_1] * Math.Log(modelDist[c_1]); } for (int f = 0; f < labeledDataset.FeatureIndex().Size(); f++) { for (int c_2 = 0; c_2 < numClasses; c_2++) { int wtIndex = IndexOf(f, c_2); for (int cPrime = 0; cPrime < numClasses; cPrime++) { derivative[wtIndex] += feature2classPairDerivatives.GetCount(new Triple <int, int, int>(f, c_2, cPrime)) * geFeature2EmpiricalDist[n][cPrime] / modelDist[cPrime]; } derivative[wtIndex] /= activeData.Count; } } } } }
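// Illustrative sketch (not part of the classifier): the per-feature objective term accumulated
// above. For each generalized-expectation feature, the model's predicted label distribution is
// averaged over the unlabeled data where the feature fires, and the loss is the cross entropy
// -sum_c p_emp(c) * log p_model(c), which equals KL(p_emp || p_model) up to a constant. Names
// here are hypothetical.
public static double GeCrossEntropySketch(double[] empiricalDist, double[] modelDist) {
    double loss = 0.0;
    for (int c = 0; c < empiricalDist.Length; c++) {
        loss += -empiricalDist[c] * Math.Log(modelDist[c]);  // modelDist is assumed smoothed away from 0
    }
    return loss;
}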
private void PrintResultsInternal(PrintWriter pw, ICounter <Pair <string, string> > results, ClassicCounter <string> labelCount) { ClassicCounter <string> correct = new ClassicCounter <string>(); ClassicCounter <string> predictionCount = new ClassicCounter <string>(); bool countGoldLabels = false; if (labelCount == null) { labelCount = new ClassicCounter <string>(); countGoldLabels = true; } foreach (Pair <string, string> predictedActual in results.KeySet()) { string predicted = predictedActual.first; string actual = predictedActual.second; if (predicted.Equals(actual)) { correct.IncrementCount(actual, results.GetCount(predictedActual)); } predictionCount.IncrementCount(predicted, results.GetCount(predictedActual)); if (countGoldLabels) { labelCount.IncrementCount(actual, results.GetCount(predictedActual)); } } DecimalFormat formatter = new DecimalFormat(); formatter.SetMaximumFractionDigits(1); formatter.SetMinimumFractionDigits(1); double totalCount = 0; double totalCorrect = 0; double totalPredicted = 0; pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF"); IList <string> labels = new List <string>(labelCount.KeySet()); labels.Sort(); foreach (string label in labels) { double numcorrect = correct.GetCount(label); double predicted = predictionCount.GetCount(label); double trueCount = labelCount.GetCount(label); double precision = (predicted > 0) ? (numcorrect / predicted) : 0; double recall = numcorrect / trueCount; double f = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0; pw.Println(StringUtils.PadOrTrim(label, MaxLabelLength) + "\t" + numcorrect + "\t" + predicted + "\t" + trueCount + "\t" + formatter.Format(precision * 100) + "\t" + formatter.Format(100 * recall) + "\t" + formatter.Format(100 * f)); if (!RelationMention.IsUnrelatedLabel(label)) { totalCount += trueCount; totalCorrect += numcorrect; totalPredicted += predicted; } } double precision_1 = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0; double recall_1 = totalCorrect / totalCount; double f_1 = (totalPredicted > 0 && totalCorrect > 0) ? 2 * precision_1 * recall_1 / (precision_1 + recall_1) : 0.0; pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + formatter.Format(100 * precision_1) + "\t" + formatter.Format(100 * recall_1) + "\t" + formatter.Format(100 * f_1)); }
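// Illustrative sketch (not part of the scorer): the per-label precision/recall/F1 arithmetic used
// in the table above. Names here are hypothetical.
public static (double precision, double recall, double f1) PrfSketch(double correct, double predicted, double gold) {
    double precision = predicted > 0 ? correct / predicted : 0.0;
    double recall = gold > 0 ? correct / gold : 0.0;
    double f1 = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0.0;
    return (precision, recall, f1);
}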
// Does L1 or L2 using FOBOS and lazy update, so L1 should not be handled in the // objective // Alternatively, you can handle other regularization in the objective, // but then, if the derivative is not sparse, this routine would not be very // efficient. However, might still be okay for CRFs public virtual ICounter <K> Minimize(F function, ICounter <K> x, int maxIterations) { Sayln(" Batch size of: " + batchSize); Sayln(" Data dimension of: " + function.DataSize()); int numBatches = (function.DataSize() - 1) / this.batchSize + 1; Sayln(" Batches per pass through data: " + numBatches); Sayln(" Number of passes is = " + numPasses); Sayln(" Max iterations is = " + maxIterations); ICounter <K> lastUpdated = new ClassicCounter <K>(); int timeStep = 0; Timing total = new Timing(); total.Start(); for (int iter = 0; iter < numPasses; iter++) { double totalObjValue = 0; for (int j = 0; j < numBatches; j++) { int[] selectedData = GetSample(function, this.batchSize); // the core adagrad ICounter <K> gradient = function.DerivativeAt(x, selectedData); totalObjValue = totalObjValue + function.ValueAt(x, selectedData); foreach (K feature in gradient.KeySet()) { double gradf = gradient.GetCount(feature); double prevrate = eta / (Math.Sqrt(sumGradSquare.GetCount(feature)) + soften); double sgsValue = sumGradSquare.IncrementCount(feature, gradf * gradf); double currentrate = eta / (Math.Sqrt(sgsValue) + soften); double testupdate = x.GetCount(feature) - (currentrate * gradient.GetCount(feature)); double lastUpdateTimeStep = lastUpdated.GetCount(feature); double idleinterval = timeStep - lastUpdateTimeStep - 1; lastUpdated.SetCount(feature, (double)timeStep); // does lazy update using idleinterval double trunc = Math.Max(0.0, (Math.Abs(testupdate) - (currentrate + prevrate * idleinterval) * this.lambdaL1)); double trunc2 = trunc * Math.Pow(1 - this.lambdaL2, currentrate + prevrate * idleinterval); double realupdate = Math.Signum(testupdate) * trunc2; if (realupdate < Eps) { x.Remove(feature); } else { x.SetCount(feature, realupdate); } // reporting timeStep++; if (timeStep > maxIterations) { Sayln("Stochastic Optimization complete. Stopped after max iterations"); break; } Sayln(System.Console.Out.Format("Iter %d \t batch: %d \t time=%.2f \t obj=%.4f", iter, timeStep, total.Report() / 1000.0, totalObjValue).ToString()); } } } return(x); }
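// Illustrative sketch (not part of the minimizer): the per-feature AdaGrad step with lazy
// FOBOS-style L1/L2 truncation used above. The learning rate shrinks with the accumulated squared
// gradient, and the "idle interval" makes up for regularization steps that were skipped while the
// feature was inactive. Names here are hypothetical.
public static double AdaGradLazyFobosStepSketch(double weight, double grad, ref double sumGradSquare, double idleInterval, double eta, double soften, double lambdaL1, double lambdaL2) {
    double prevRate = eta / (Math.Sqrt(sumGradSquare) + soften);
    sumGradSquare += grad * grad;
    double rate = eta / (Math.Sqrt(sumGradSquare) + soften);
    double testUpdate = weight - rate * grad;                         // plain AdaGrad step
    double shrink = (rate + prevRate * idleInterval) * lambdaL1;      // accumulated L1 penalty
    double trunc = Math.Max(0.0, Math.Abs(testUpdate) - shrink);      // soft-thresholding
    trunc *= Math.Pow(1 - lambdaL2, rate + prevRate * idleInterval);  // accumulated L2 decay
    return Math.Sign(testUpdate) * trunc;
}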
private static void Display <T>(ClassicCounter <T> c, PrintWriter pw) { IList <T> cats = new List <T>(c.KeySet()); cats.Sort(Counters.ToComparatorDescending(c)); foreach (T ob in cats) { pw.Println(ob + " " + c.GetCount(ob)); } }
public virtual void RunCoref(Document document) { Compressor <string> compressor = new Compressor <string>(); if (Thread.Interrupted()) { // Allow interrupting throw new RuntimeInterruptedException(); } IDictionary <Pair <int, int>, bool> pairs = new Dictionary <Pair <int, int>, bool>(); foreach (KeyValuePair <int, IList <int> > e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch)) { foreach (int m1 in e.Value) { pairs[new Pair <int, int>(m1, e.Key)] = true; } } DocumentExamples examples = extractor.Extract(0, document, pairs, compressor); ICounter <Pair <int, int> > pairwiseScores = new ClassicCounter <Pair <int, int> >(); foreach (Example mentionPair in examples.examples) { if (Thread.Interrupted()) { // Allow interrupting throw new RuntimeInterruptedException(); } pairwiseScores.IncrementCount(new Pair <int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor)); } IList <Pair <int, int> > mentionPairs = new List <Pair <int, int> >(pairwiseScores.KeySet()); mentionPairs.Sort(null); ICollection <int> seenAnaphors = new HashSet <int>(); foreach (Pair <int, int> pair in mentionPairs) { if (seenAnaphors.Contains(pair.second)) { continue; } if (Thread.Interrupted()) { // Allow interrupting throw new RuntimeInterruptedException(); } seenAnaphors.Add(pair.second); Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType; Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType; if (pairwiseScores.GetCount(pair) > thresholds[new Pair <bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)]) { CorefUtils.MergeCoreferenceClusters(pair, document); } } }
public virtual void DumpStats() { System.Console.Out.WriteLine("%% Counts of nonterminals:"); IList <string> biggestCounts = new List <string>(nonTerms.KeySet()); biggestCounts.Sort(Counters.ToComparatorDescending(nonTerms)); foreach (string str in biggestCounts) { System.Console.Out.WriteLine(str + ": " + nonTerms.GetCount(str)); } }
public override IDependencyGrammar FormResult() { wordIndex.AddToIndex(LexiconConstants.UnknownWord); MLEDependencyGrammar dg = new MLEDependencyGrammar(tlpParams, directional, useDistance, useCoarseDistance, basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex); foreach (IntDependency dependency in dependencyCounter.KeySet()) { dg.AddRule(dependency, dependencyCounter.GetCount(dependency)); } return(dg); }
public static TransducerGraph CreateGraphFromPaths <T>(ClassicCounter <IList <T> > pathCounter, int markovOrder) { TransducerGraph graph = new TransducerGraph(); // empty foreach (IList <T> path in pathCounter.KeySet()) { double count = pathCounter.GetCount(path); AddOnePathToGraph(path, count, markovOrder, graph); } return(graph); }
/// <summary>
/// Return the probability (as a real number between 0 and 1) of stopping
/// rather than generating another argument at this position.
/// </summary>
/// <param name="dependency">
/// The dependency used as the basis for the stop decision.
/// Tags are assumed to be in the TagProjection space.
/// </param>
/// <returns>The probability of stopping rather than generating another argument.</returns>
protected internal virtual double GetStopProb(IntDependency dependency) {
    short binDistance = DistanceBin(dependency.distance);
    IntTaggedWord unknownHead = new IntTaggedWord(-1, dependency.head.tag);
    IntTaggedWord anyHead = new IntTaggedWord(IntTaggedWord.AnyWordInt, dependency.head.tag);
    IntDependency temp = new IntDependency(dependency.head, stopTW, dependency.leftHeaded, binDistance);
    double c_stop_hTWds = stopCounter.GetCount(temp);
    temp = new IntDependency(unknownHead, stopTW, dependency.leftHeaded, binDistance);
    double c_stop_hTds = stopCounter.GetCount(temp);
    temp = new IntDependency(dependency.head, wildTW, dependency.leftHeaded, binDistance);
    double c_hTWds = stopCounter.GetCount(temp);
    temp = new IntDependency(anyHead, wildTW, dependency.leftHeaded, binDistance);
    double c_hTds = stopCounter.GetCount(temp);
    // back off from the word+tag estimate to the tag-only estimate, then interpolate
    double p_stop_hTds = (c_hTds > 0.0 ? c_stop_hTds / c_hTds : 1.0);
    double pb_stop_hTWds = (c_stop_hTWds + smooth_stop * p_stop_hTds) / (c_hTWds + smooth_stop);
    return pb_stop_hTWds;
}
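// Illustrative sketch (not part of the grammar): the interpolation computed above. The
// word-specific stop estimate is smoothed toward the tag-only estimate with strength smooth_stop:
//   p_stop(tag)       = c_stop_hTds / c_hTds                                 (1.0 if never observed)
//   p_stop(word, tag) = (c_stop_hTWds + smooth_stop * p_stop(tag)) / (c_hTWds + smooth_stop)
// Names here are hypothetical.
public static double SmoothedStopProbSketch(double cStopWordTag, double cWordTag, double cStopTag, double cTag, double smoothStop) {
    double pStopTag = cTag > 0.0 ? cStopTag / cTag : 1.0;
    return (cStopWordTag + smoothStop * pStopTag) / (cWordTag + smoothStop);
}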
/// <summary>Need to sort the counter by feature keys and dump it</summary> public static void PrintSVMLightFormat(PrintWriter pw, ClassicCounter <int> c, int classNo) { int[] features = Sharpen.Collections.ToArray(c.KeySet(), new int[c.KeySet().Count]); Arrays.Sort(features); StringBuilder sb = new StringBuilder(); sb.Append(classNo); sb.Append(' '); foreach (int f in features) { sb.Append(f + 1).Append(':').Append(c.GetCount(f)).Append(' '); } pw.Println(sb.ToString()); }
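// Illustrative sketch (not part of the dataset classes): the line layout written above. An
// svm_light example is "<label> <index>:<value> ..." with feature indices sorted and shifted to
// be 1-based. Names here are hypothetical.
public static string SvmLightLineSketch(int classLabel, IDictionary<int, double> featureValues) {
    var indices = new List<int>(featureValues.Keys);
    indices.Sort();
    var sb = new StringBuilder();
    sb.Append(classLabel);
    foreach (int i in indices) {
        sb.Append(' ').Append(i + 1).Append(':').Append(featureValues[i]);  // 1-based feature indices
    }
    return sb.ToString();
}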
/// <summary>A utility to get useful information out of a CorefMention.</summary> /// <remarks> /// A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are /// associated with this mention, and it returns a score for how much we think this mention should be the canonical /// mention. /// </remarks> /// <param name="doc">The document this mention is referenced into.</param> /// <param name="mention">The mention itself.</param> /// <returns>A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.</returns> private static Pair <IList <CoreLabel>, double> GrokCorefMention(Annotation doc, CorefChain.CorefMention mention) { IList <CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[mention.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation)); IList <CoreLabel> mentionAsTokens = tokens.SubList(mention.startIndex - 1, mention.endIndex - 1); // Try to assess this mention's NER type ICounter <string> nerVotes = new ClassicCounter <string>(); mentionAsTokens.Stream().Filter(null).ForEach(null); string ner = Counters.Argmax(nerVotes, null); double nerCount = nerVotes.GetCount(ner); double nerScore = nerCount * nerCount / ((double)mentionAsTokens.Count); // Return return(Pair.MakePair(mentionAsTokens, nerScore)); }
/// <summary> /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter /// using the actual features and labels. /// </summary> /// <remarks> /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter /// using the actual features and labels. Because this is svm_light, and not svm_struct, the /// weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class /// (which correspond to labelIndex.get(1)) are just the negation of one another. /// </remarks> private ClassicCounter <Pair <F, L> > ConvertSVMLightWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex) { ClassicCounter <Pair <F, L> > newWeights = new ClassicCounter <Pair <F, L> >(); foreach (int i in weights.KeySet()) { F f = featureIndex.Get(i - 1); double w = weights.GetCount(i); // the first guy in the labelIndex was the +1 class and the second guy // was the -1 class newWeights.IncrementCount(new Pair <F, L>(f, labelIndex.Get(0)), w); newWeights.IncrementCount(new Pair <F, L>(f, labelIndex.Get(1)), -w); } return(newWeights); }
/// <summary> /// Converts the svm_struct weight Counter (in which the weight for a feature/label pair /// correspondes to ((labelIndex * numFeatures)+(featureIndex+1))) into a weight Counter /// using the actual features and labels. /// </summary> private ClassicCounter <Pair <F, L> > ConvertSVMStructWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex) { // int numLabels = labelIndex.size(); int numFeatures = featureIndex.Size(); ClassicCounter <Pair <F, L> > newWeights = new ClassicCounter <Pair <F, L> >(); foreach (int i in weights.KeySet()) { L l = labelIndex.Get((i - 1) / numFeatures); // integer division on purpose F f = featureIndex.Get((i - 1) % numFeatures); double w = weights.GetCount(i); newWeights.IncrementCount(new Pair <F, L>(f, l), w); } return(newWeights); }
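// Illustrative sketch (not part of the classifier factory): how a raw svm_struct weight index is
// decoded above. Weight i (1-based) belongs to label (i - 1) / numFeatures and feature
// (i - 1) % numFeatures; in the binary svm_light case, weight i maps to feature i - 1 and the two
// labels simply receive +w and -w. Names here are hypothetical.
public static (int labelIndex, int featureIndex) DecodeSvmStructIndexSketch(int i, int numFeatures) {
    return ((i - 1) / numFeatures, (i - 1) % numFeatures);
}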
private static void Display <T>(ClassicCounter <T> c, int num, PrintWriter pw) { IList <T> rules = new List <T>(c.KeySet()); rules.Sort(Counters.ToComparatorDescending(c)); int rSize = rules.Count; if (num > rSize) { num = rSize; } for (int i = 0; i < num; i++) { pw.Println(rules[i] + " " + c.GetCount(rules[i])); } }
/// <summary>Trains the first-character based unknown word model.</summary>
/// <param name="tw">The word we are currently training on</param>
/// <param name="loc">The position of that word</param>
/// <param name="weight">The weight to give this word in terms of training</param>
public override void Train(TaggedWord tw, int loc, double weight) {
    if (useGT) {
        unknownGTTrainer.Train(tw, weight);
    }
    string word = tw.Word();
    ILabel tagL = new Tag(tw.Tag());
    string first = Sharpen.Runtime.Substring(word, 0, 1);
    if (useUnicodeType) {
        char ch = word[0];
        System.Globalization.UnicodeCategory type = System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch);
        if (type != System.Globalization.UnicodeCategory.OtherLetter) {
            // standard Chinese characters are classified as OtherLetter, so anything else
            // is represented by its Unicode category number rather than its first character
            first = ((int)type).ToString();
        }
    }
    string tag = tw.Tag();
    if (!c.Contains(tagL)) {
        c[tagL] = new ClassicCounter<string>();
    }
    c[tagL].IncrementCount(first, weight);
    tc.IncrementCount(tagL, weight);
    seenFirst.Add(first);
    IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(iW, weight);
    if (treesRead > indexToStartUnkCounting) {
        // start doing this once some way through trees; treesRead is 1-based counting
        if (seenCounter.GetCount(iW) < 2) {
            IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
            unSeenCounter.IncrementCount(iT, weight);
            unSeenCounter.IncrementCount(iTotal, weight);
        }
    }
}
public virtual void RunCoref(Document document) {
    IList<Mention> sortedMentions = CorefUtils.GetSortedMentions(document);
    // group mentions by their head index
    IDictionary<int, IList<Mention>> mentionsByHeadIndex = new Dictionary<int, IList<Mention>>();
    foreach (Mention m in sortedMentions) {
        IList<Mention> withIndex;
        if (!mentionsByHeadIndex.TryGetValue(m.headIndex, out withIndex)) {
            withIndex = new List<Mention>();
            mentionsByHeadIndex[m.headIndex] = withIndex;
        }
        withIndex.Add(m);
    }
    SimpleMatrix documentEmbedding = embeddingExtractor.GetDocumentEmbedding(document);
    IDictionary<int, SimpleMatrix> antecedentEmbeddings = new Dictionary<int, SimpleMatrix>();
    IDictionary<int, SimpleMatrix> anaphorEmbeddings = new Dictionary<int, SimpleMatrix>();
    ICounter<int> anaphoricityScores = new ClassicCounter<int>();
    foreach (Mention m_1 in sortedMentions) {
        SimpleMatrix mentionEmbedding = embeddingExtractor.GetMentionEmbeddings(m_1, documentEmbedding);
        antecedentEmbeddings[m_1.mentionID] = model.GetAntecedentEmbedding(mentionEmbedding);
        anaphorEmbeddings[m_1.mentionID] = model.GetAnaphorEmbedding(mentionEmbedding);
        anaphoricityScores.IncrementCount(m_1.mentionID, model.GetAnaphoricityScore(mentionEmbedding, featureExtractor.GetAnaphoricityFeatures(m_1, document, mentionsByHeadIndex)));
    }
    IDictionary<int, IList<int>> mentionToCandidateAntecedents = CorefUtils.HeuristicFilter(sortedMentions, maxMentionDistance, maxMentionDistanceWithStringMatch);
    foreach (KeyValuePair<int, IList<int>> e in mentionToCandidateAntecedents) {
        // the anaphoricity score (shifted by the greedyness setting) is the threshold
        // that a candidate antecedent's pairwise score must beat
        double bestScore = anaphoricityScores.GetCount(e.Key) - 50 * (greedyness - 0.5);
        int m_2 = e.Key;
        int? antecedent = null;
        foreach (int ca in e.Value) {
            double score = model.GetPairwiseScore(antecedentEmbeddings[ca], anaphorEmbeddings[m_2], featureExtractor.GetPairFeatures(new Pair<int, int>(ca, m_2), document, mentionsByHeadIndex));
            if (score > bestScore) {
                bestScore = score;
                antecedent = ca;
            }
        }
        if (antecedent != null) {
            CorefUtils.MergeCoreferenceClusters(new Pair<int, int>(antecedent.Value, m_2), document);
        }
    }
}
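// Illustrative sketch (not part of the coref system): the greedy antecedent choice made above.
// A mention is linked to its highest-scoring candidate antecedent, but only if that score beats
// the mention's anaphoricity-based threshold (shifted by the greedyness setting, so higher
// greedyness links more). Names here are hypothetical.
public static int? BestAntecedentSketch(double anaphoricityScore, double greedyness, IList<int> candidates, Func<int, double> pairwiseScore) {
    double best = anaphoricityScore - 50 * (greedyness - 0.5);  // threshold to beat
    int? antecedent = null;
    foreach (int candidate in candidates) {
        double score = pairwiseScore(candidate);
        if (score > best) {
            best = score;
            antecedent = candidate;
        }
    }
    return antecedent;
}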
private static ICounter <string> GetFeatures(ClustererDataLoader.ClustererDoc doc, IList <Pair <int, int> > mentionPairs, ICounter <Pair <int, int> > scores) { ICounter <string> features = new ClassicCounter <string>(); double maxScore = 0; double minScore = 1; ICounter <string> totals = new ClassicCounter <string>(); ICounter <string> totalsLog = new ClassicCounter <string>(); ICounter <string> counts = new ClassicCounter <string>(); foreach (Pair <int, int> mentionPair in mentionPairs) { if (!scores.ContainsKey(mentionPair)) { mentionPair = new Pair <int, int>(mentionPair.second, mentionPair.first); } double score = scores.GetCount(mentionPair); double logScore = CappedLog(score); string mt1 = doc.mentionTypes[mentionPair.first]; string mt2 = doc.mentionTypes[mentionPair.second]; mt1 = mt1.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL"; mt2 = mt2.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL"; string conj = "_" + mt1 + "_" + mt2; maxScore = Math.Max(maxScore, score); minScore = Math.Min(minScore, score); totals.IncrementCount(string.Empty, score); totalsLog.IncrementCount(string.Empty, logScore); counts.IncrementCount(string.Empty); totals.IncrementCount(conj, score); totalsLog.IncrementCount(conj, logScore); counts.IncrementCount(conj); } features.IncrementCount("max", maxScore); features.IncrementCount("min", minScore); foreach (string key in counts.KeySet()) { features.IncrementCount("avg" + key, totals.GetCount(key) / mentionPairs.Count); features.IncrementCount("avgLog" + key, totalsLog.GetCount(key) / mentionPairs.Count); } return(features); }
// todo: Fix javadoc, have unit tested /// <summary>Print SVM Light Format file.</summary> /// <remarks> /// Print SVM Light Format file. /// The following comments are no longer applicable because I am /// now printing out the exact labelID for each example. -Ramesh ([email protected]) 12/17/2009. /// If the Dataset has more than 2 classes, then it /// prints using the label index (+1) (for svm_struct). If it is 2 classes, then the labelIndex.get(0) /// is mapped to +1 and labelIndex.get(1) is mapped to -1 (for svm_light). /// </remarks> public virtual void PrintSVMLightFormat(PrintWriter pw) { //assumes each data item has a few features on, and sorts the feature keys while collecting the values in a counter // old comment: // the following code commented out by Ramesh ([email protected]) 12/17/2009. // why not simply print the exact id of the label instead of mapping to some values?? // new comment: // mihai: we NEED this, because svm_light has special conventions not supported by default by our labels, // e.g., in a multiclass setting it assumes that labels start at 1 whereas our labels start at 0 (08/31/2010) string[] labelMap = MakeSvmLabelMap(); for (int i = 0; i < size; i++) { RVFDatum <L, F> d = GetRVFDatum(i); ICounter <F> c = d.AsFeaturesCounter(); ClassicCounter <int> printC = new ClassicCounter <int>(); foreach (F f in c.KeySet()) { printC.SetCount(featureIndex.IndexOf(f), c.GetCount(f)); } int[] features = Sharpen.Collections.ToArray(printC.KeySet(), new int[printC.KeySet().Count]); Arrays.Sort(features); StringBuilder sb = new StringBuilder(); sb.Append(labelMap[labels[i]]).Append(' '); // sb.append(labels[i]).append(' '); // commented out by mihai: labels[i] breaks svm_light conventions! /* Old code: assumes that F is Integer.... * * for (int f: features) { * sb.append((f + 1)).append(":").append(c.getCount(f)).append(" "); * } */ //I think this is what was meant (using printC rather than c), but not sure // ~Sarah Spikes ([email protected]) foreach (int f_1 in features) { sb.Append((f_1 + 1)).Append(':').Append(printC.GetCount(f_1)).Append(' '); } pw.Println(sb.ToString()); } }