/// <summary>Converts a datum's features from raw counts to L1-normalized TF-IDF values.</summary>
        /// <param name="datum">The datum with a collection of features.</param>
        /// <param name="featureDocCounts">A counter of the document count for each feature.</param>
        /// <returns>An RVFDatum with L1-normalized TF-IDF features.</returns>
        public virtual RVFDatum <L, F> GetL1NormalizedTFIDFDatum(IDatum <L, F> datum, ICounter <F> featureDocCounts)
        {
            ICounter <F> tfidfFeatures = new ClassicCounter <F>();

            foreach (F feature in datum.AsFeatures())
            {
                if (featureDocCounts.ContainsKey(feature))
                {
                    tfidfFeatures.IncrementCount(feature, 1.0);
                }
            }
            double l1norm = 0;

            foreach (F feature_1 in tfidfFeatures.KeySet())
            {
                double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature_1) + 0.5));
                double tf  = tfidfFeatures.GetCount(feature_1);
                tfidfFeatures.SetCount(feature_1, tf * idf);
                l1norm += tf * idf;
            }
            foreach (F feature_2 in tfidfFeatures.KeySet())
            {
                double tfidf = tfidfFeatures.GetCount(feature_2);
                tfidfFeatures.SetCount(feature_2, tfidf / l1norm);
            }
            RVFDatum <L, F> rvfDatum = new RVFDatum <L, F>(tfidfFeatures, datum.Label());

            return(rvfDatum);
        }
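        // A minimal standalone sketch (toy counts, not the library's counters) of the weighting
        // above, assuming a hypothetical corpus of 10 documents: idf = log((N + 1) / (docCount + 0.5)),
        // tf is the feature's occurrence count in the datum, and the tf*idf weights are then
        // divided by their sum to form an L1-normalized vector.
        using System;
        using System.Collections.Generic;
        using System.Linq;

        class TfidfSketch
        {
            static void Main()
            {
                int numDocs = 10; // plays the role of this.Size() above
                // hypothetical document frequencies for three features of one datum
                var docFreq = new Dictionary<string, double> { ["a"] = 9, ["b"] = 3, ["c"] = 1 };
                var tfidf = docFreq.ToDictionary(
                    kv => kv.Key,
                    kv => 1.0 * Math.Log((numDocs + 1) / (kv.Value + 0.5))); // tf = 1 here
                double l1norm = tfidf.Values.Sum();
                foreach (var kv in tfidf)
                {
                    Console.WriteLine($"{kv.Key}: {kv.Value / l1norm:F3}"); // rare "c" gets the largest weight
                }
            }
        }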
        public override Pair <UnaryGrammar, BinaryGrammar> FormResult()
        {
            stateIndex.AddToIndex(LexiconConstants.BoundaryTag);
            BinaryGrammar bg = new BinaryGrammar(stateIndex);
            UnaryGrammar  ug = new UnaryGrammar(stateIndex);

            // add unaries
            foreach (UnaryRule ur in unaryRules)
            {
                ur.score = (float)Math.Log(unaryRuleCounter.GetCount(ur) / symbolCounter.GetCount(stateIndex.Get(ur.parent)));
                if (op.trainOptions.CompactGrammar() >= 4)
                {
                    ur.score = (float)unaryRuleCounter.GetCount(ur);
                }
                ug.AddRule(ur);
            }
            // add binaries
            foreach (BinaryRule br in binaryRules)
            {
                br.score = (float)Math.Log((binaryRuleCounter.GetCount(br) - op.trainOptions.ruleDiscount) / symbolCounter.GetCount(stateIndex.Get(br.parent)));
                if (op.trainOptions.CompactGrammar() >= 4)
                {
                    br.score = (float)binaryRuleCounter.GetCount(br);
                }
                bg.AddRule(br);
            }
            return(new Pair <UnaryGrammar, BinaryGrammar>(ug, bg));
        }
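        // The scores assigned in FormResult are maximum-likelihood estimates: the log of a rule's
        // count divided by the count of its parent symbol. A tiny standalone sketch with
        // hypothetical counts:
        using System;

        class RuleScoreSketch
        {
            static void Main()
            {
                double ruleCount = 30.0;    // hypothetical count of a rule such as NP -> DT NN
                double parentCount = 120.0; // hypothetical count of NP as a parent symbol
                Console.WriteLine(Math.Log(ruleCount / parentCount)); // log(0.25) = -1.386...
            }
        }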
        /// <summary>Checks whether a word is in the lexicon.</summary>
        /// <remarks>
        /// Checks whether a word is in the lexicon. This version works even while
        /// compiling lexicon with current counters (rather than using the compiled
        /// rulesWithWord array).
        /// TODO: The previous version would insert rules into the
        /// wordNumberer.  Is that the desired behavior?  Why not test in
        /// some way that doesn't affect the index?  For example, start by
        /// testing wordIndex.contains(word).
        /// </remarks>
        /// <param name="word">The word as a String</param>
        /// <returns>Whether the word is in the lexicon</returns>
        public virtual bool IsKnown(string word)
        {
            if (!wordIndex.Contains(word))
            {
                return(false);
            }
            IntTaggedWord iW = new IntTaggedWord(wordIndex.IndexOf(word), nullTag);

            return(seenCounter.GetCount(iW) > 0.0);
        }
        /// <summary>Takes time linear in number of arcs.</summary>
        public virtual TransducerGraph PushLambdas(TransducerGraph graph, ClassicCounter lambda)
        {
            // arcs have been copied too, so we don't mess up graph
            TransducerGraph result = graph.Clone();
            ICollection <TransducerGraph.Arc> arcs = result.GetArcs();

            foreach (TransducerGraph.Arc arc in arcs)
            {
                double sourceLambda = lambda.GetCount(arc.GetSourceNode());
                double targetLambda = lambda.GetCount(arc.GetTargetNode());
                double oldOutput    = ((double)arc.GetOutput());
                double newOutput    = oldOutput + targetLambda - sourceLambda;
                arc.SetOutput(newOutput);
            }
            // do initialOutput
            double startLambda = lambda.GetCount(result.GetStartNode());

            if (startLambda != 0.0)
            {
                // add it back to the outbound arcs from start (instead of adding it to the initialOutput)
                ICollection <TransducerGraph.Arc> startArcs = result.GetArcsBySource(result.GetStartNode());
                foreach (TransducerGraph.Arc arc_1 in startArcs)
                {
                    double oldOutput = ((double)arc_1.GetOutput());
                    double newOutput = oldOutput + startLambda;
                    arc_1.SetOutput(newOutput);
                }
            }
            // do finalOutput
            foreach (object o in result.GetEndNodes())
            {
                double endLambda = lambda.GetCount(o);
                if (endLambda != 0.0)
                {
                    // subtract it from the inbound arcs to end (instead of subtracting it from the finalOutput)
                    ICollection <TransducerGraph.Arc> endArcs = result.GetArcsByTarget(o);
                    foreach (TransducerGraph.Arc arc_1 in endArcs)
                    {
                        double oldOutput = ((double)arc_1.GetOutput());
                        double newOutput = oldOutput - endLambda;
                        arc_1.SetOutput(newOutput);
                    }
                }
            }
            return(result);
        }
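        // Pushing lambdas is a potential-based reweighting: each arc gains
        // lambda(target) - lambda(source), so a path's total output telescopes to a shift of
        // lambda(end) - lambda(start), which the start- and end-arc fixups above cancel out.
        // A standalone sketch with a hypothetical two-arc path A -> B -> C:
        using System;
        using System.Collections.Generic;

        class LambdaPushSketch
        {
            static void Main()
            {
                var lambda = new Dictionary<string, double> { ["A"] = 1, ["B"] = 4, ["C"] = 0 };
                double outAB = 2.0 + lambda["B"] - lambda["A"]; // per-arc reweighting
                double outBC = 3.0 + lambda["C"] - lambda["B"];
                Console.WriteLine(outAB + outBC); // 4.0 = (2.0 + 3.0) + lambda["C"] - lambda["A"]
                // the fixups add lambda(start) to start arcs and subtract lambda(end) from end arcs:
                Console.WriteLine((outAB + lambda["A"]) + (outBC - lambda["C"])); // 5.0, as before
            }
        }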
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private void ReadObject(ObjectInputStream stream)
        {
            stream.DefaultReadObject();
            //    log.info("Before decompression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            ClassicCounter <IntDependency> compressedArgC = argCounter;

            argCounter = new ClassicCounter <IntDependency>();
            ClassicCounter <IntDependency> compressedStopC = stopCounter;

            stopCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency d in compressedArgC.KeySet())
            {
                double count = compressedArgC.GetCount(d);
                ExpandArg(d, d.distance, count);
            }
            foreach (IntDependency d_1 in compressedStopC.KeySet())
            {
                double count = compressedStopC.GetCount(d_1);
                ExpandStop(d_1, d_1.distance, count, false);
            }
            //    log.info("After decompression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            expandDependencyMap = null;
        }
 public override IUnknownWordModel FinishTraining()
 {
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     foreach (KeyValuePair <ILabel, ClassicCounter <string> > entry in c)
     {
         /* outer iteration is over tags */
         ILabel key = entry.Key;
         ClassicCounter <string> wc = entry.Value;
         // counts for words given a tag
         if (!tagHash.Contains(key))
         {
             tagHash[key] = new ClassicCounter <string>();
         }
         /* the UNKNOWN sequence is assumed to be seen once in each tag */
         // This is sort of broken, but you can regard it as a Dirichlet prior.
         tc.IncrementCount(key);
         wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         /* inner iteration is over words */
         foreach (string end in wc.KeySet())
         {
             double prob = Math.Log((wc.GetCount(end)) / (tc.GetCount(key)));
             // p(sig|tag)
             tagHash[key].SetCount(end, prob);
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob,encoding);
     return(model);
 }
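 // The "seen once per tag" pseudo-count above just guarantees the unknown signature gets
 // nonzero probability mass. A standalone sketch of the resulting estimate, with
 // hypothetical counts:
 using System;

 class SigProbSketch
 {
     static void Main()
     {
         double tagTotal = 10.0; // tc.GetCount(key), including the forced UNKNOWN count
         double sigCount = 9.0;  // hypothetical count of one signature under this tag
         double unkCount = 1.0;  // the UNKNOWN pseudo-count
         Console.WriteLine(Math.Log(sigCount / tagTotal)); // log p(sig|tag)
         Console.WriteLine(Math.Log(unkCount / tagTotal)); // log p(UNK|tag), finite by construction
     }
 }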
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            // scan data
            string word      = tw.Word();
            string subString = model.GetSignature(word, loc);
            ILabel tag       = new Tag(tw.Tag());

            if (!c.Contains(tag))
            {
                c[tag] = new ClassicCounter <string>();
            }
            c[tag].IncrementCount(subString, weight);
            tc.IncrementCount(tag, weight);
            seenEnd.Add(subString);
            string        tagStr = tw.Tag();
            IntTaggedWord iW     = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once we are some way through the trees;
                // treesRead uses 1-based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
                }
            }
        }
 public virtual void FinishTraining()
 {
     // testing: get some stats here
     log.Info("Total tokens: " + tokens);
     log.Info("Total WordTag types: " + wtCount.KeySet().Count);
     log.Info("Total tag types: " + tagCount.KeySet().Count);
     log.Info("Total word types: " + seenWords.Count);
     /* find # of once-seen words for each tag */
     foreach (Pair <string, string> wt in wtCount.KeySet())
     {
         if (wtCount.GetCount(wt) == 1)
         {
             r1.IncrementCount(wt.Second());
         }
     }
     /* find # of unseen words for each tag */
     foreach (string tag in tagCount.KeySet())
     {
         foreach (string word in seenWords)
         {
             Pair <string, string> wt_1 = new Pair <string, string>(word, tag);
             if (!(wtCount.KeySet().Contains(wt_1)))
             {
                 r0.IncrementCount(tag);
             }
         }
     }
     /* set unseen word probability for each tag */
     foreach (string tag_1 in tagCount.KeySet())
     {
         float logprob = (float)Math.Log(r1.GetCount(tag_1) / (tagCount.GetCount(tag_1) * r0.GetCount(tag_1)));
          unknownGT[tag_1] = logprob;
     }
 }
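 // The unseen-word estimate above is Good-Turing flavored: the probability mass of
 // once-seen word types stands in for the mass of unseen types, spread uniformly over
 // the r0 words never seen with the tag. A standalone sketch with hypothetical statistics:
 using System;

 class GoodTuringSketch
 {
     static void Main()
     {
         double r1 = 50.0;          // word types seen exactly once with this tag
         double r0 = 10000.0;       // word types never seen with this tag
         double tagTokens = 2000.0; // total tokens of this tag
         float logprob = (float)Math.Log(r1 / (tagTokens * r0));
         Console.WriteLine(logprob); // log p(unseen word | tag)
     }
 }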
 public override IUnknownWordModel FinishTraining()
 {
     // Map<String,Float> unknownGT = null;
     if (useGT)
     {
         unknownGTTrainer.FinishTraining();
     }
     // unknownGT = unknownGTTrainer.unknownGT;
     foreach (ILabel tagLab in c.Keys)
     {
         // outer iteration is over tags as Labels
         ClassicCounter <string> wc = c[tagLab];
         // counts for words given a tag
         if (!tagHash.Contains(tagLab))
         {
             tagHash[tagLab] = new ClassicCounter <string>();
         }
         // the UNKNOWN first character is assumed to be seen once in
         // each tag
         // this is really sort of broken!  (why??)
         tc.IncrementCount(tagLab);
         wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
         // inner iteration is over words  as strings
         foreach (string first in wc.KeySet())
         {
             double prob = Math.Log(((wc.GetCount(first))) / tc.GetCount(tagLab));
             tagHash[tagLab].SetCount(first, prob);
         }
     }
     //if (Test.verbose)
     //EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob,encoding);
     return(model);
 }
        /// <summary>Trains this unknown-word model on a single TaggedWord from the training trees.</summary>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
            IntTaggedWord iT  = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, iTW.tag);
            IntTaggedWord iW  = new IntTaggedWord(iTW.word, UnknownWordModelTrainerConstants.nullTag);

            seenCounter.IncrementCount(iW, weight);
            IntTaggedWord i = UnknownWordModelTrainerConstants.NullItw;

            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once we are some way through the trees;
                // treesRead uses 1-based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    // it's an entirely unknown word
                    int           s   = model.GetSignatureIndex(iTW.word, loc, wordIndex.Get(iTW.word));
                    IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
                    IntTaggedWord iS  = new IntTaggedWord(s, UnknownWordModelTrainerConstants.nullTag);
                    unSeenCounter.IncrementCount(iTS, weight);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iS, weight);
                    unSeenCounter.IncrementCount(i, weight);
                }
            }
        }
        /// <exception cref="System.IO.IOException"/>
        private void WriteObject(ObjectOutputStream stream)
        {
            //    log.info("\nBefore compression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            ClassicCounter <IntDependency> fullArgCounter = argCounter;

            argCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency dependency in fullArgCounter.KeySet())
            {
                if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1)
                {
                    argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency));
                }
            }
            ClassicCounter <IntDependency> fullStopCounter = stopCounter;

            stopCounter = new ClassicCounter <IntDependency>();
            foreach (IntDependency dependency_1 in fullStopCounter.KeySet())
            {
                if (dependency_1.head.word != -1)
                {
                    stopCounter.IncrementCount(dependency_1, fullStopCounter.GetCount(dependency_1));
                }
            }
            //    log.info("After compression:");
            //    log.info("arg size: " + argCounter.size() + "  total: " + argCounter.totalCount());
            //    log.info("stop size: " + stopCounter.size() + "  total: " + stopCounter.totalCount());
            stream.DefaultWriteObject();
            argCounter  = fullArgCounter;
            stopCounter = fullStopCounter;
        }
        /// <summary>Check if a unit exists in the literal string.</summary>
        /// <remarks>
        /// Check if a unit exists in the literal string. If so, parse it by making use of
        /// the compositionality; otherwise return null.
        /// </remarks>
        /// <param name="s"/>
        /// <param name="unit"/>
        /// <returns>The parsed value, or null if the unit is absent or parsing fails.</returns>
        private static double? CompositeAtUnitIfExists(string s, string unit)
        {
            // invalid unit
            if (!quantityUnitToValues.ContainsKey(unit))
            {
                return(null);
            }
            int idx = s.IndexOf(unit);

            if (idx != -1)
            {
                double? first = 1.0;
                // We need special handling for 十 and 百 when they occur as the first char:
                // in Chinese 十二 is very common, and 百二十 is sometimes valid as well.
                if (("十".Equals(unit) || "百".Equals(unit)) && idx == 0)
                {
                    // do nothing: keep the implicit multiplier of 1
                }
                else
                {
                    // otherwise we try to parse the value before the unit
                    first = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, 0, idx));
                }
                double? second = RecurNormalizeLiteralIntegerString(Sharpen.Runtime.Substring(s, idx + 1));
                if (first != null && second != null)
                {
                    return(first * quantityUnitToValues.GetCount(unit) + second);
                }
            }
            // return null if unit is not present or fails to parse
            return(null);
        }
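        // The compositional rule above is value = (multiplier before the unit) * (unit value)
        // + (remainder after the unit). A standalone sketch with hypothetical parsed pieces:
        using System;

        class UnitCompositionSketch
        {
            static void Main()
            {
                double unit = 100.0;  // assumed value of 百 in quantityUnitToValues
                // "百二十" (120): 百 is at index 0, so the multiplier defaults to 1
                Console.WriteLine(1.0 * unit + 20.0); // 120
                // "三百二十" (320): the prefix "三" parses to 3
                Console.WriteLine(3.0 * unit + 20.0); // 320
            }
        }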
        protected internal override void Calculate(double[] x)
        {
            classifier.SetWeights(To2D(x));
            if (derivative == null)
            {
                derivative = new double[x.Length];
            }
            else
            {
                Arrays.Fill(derivative, 0.0);
            }
            ICounter <Triple <int, int, int> > feature2classPairDerivatives = new ClassicCounter <Triple <int, int, int> >();

            value = 0.0;
            for (int n = 0; n < geFeatures.Count; n++)
            {
                //F feature = geFeatures.get(n);
                double[] modelDist = new double[numClasses];
                Arrays.Fill(modelDist, 0);
                //go over the unlabeled active data to compute expectations
                IList <int> activeData = geFeature2DatumList[n];
                foreach (int activeDatum in activeData)
                {
                    IDatum <L, F> datum = unlabeledDataList[activeDatum];
                    double[]      probs = GetModelProbs(datum);
                    for (int c = 0; c < numClasses; c++)
                    {
                        modelDist[c] += probs[c];
                    }
                    UpdateDerivative(datum, probs, feature2classPairDerivatives);
                }
                // computes p(y_d)*(1-p(y_d))*f_d for all active features.
                // now compute the value (KL-divergence) and the final value of the derivative.
                if (activeData.Count > 0)
                {
                    for (int c = 0; c < numClasses; c++)
                    {
                        modelDist[c] /= activeData.Count;
                    }
                    SmoothDistribution(modelDist);
                    for (int c_1 = 0; c_1 < numClasses; c_1++)
                    {
                        value += -geFeature2EmpiricalDist[n][c_1] * Math.Log(modelDist[c_1]);
                    }
                    for (int f = 0; f < labeledDataset.FeatureIndex().Size(); f++)
                    {
                        for (int c_2 = 0; c_2 < numClasses; c_2++)
                        {
                            int wtIndex = IndexOf(f, c_2);
                            for (int cPrime = 0; cPrime < numClasses; cPrime++)
                            {
                                derivative[wtIndex] += feature2classPairDerivatives.GetCount(new Triple <int, int, int>(f, c_2, cPrime)) * geFeature2EmpiricalDist[n][cPrime] / modelDist[cPrime];
                            }
                            derivative[wtIndex] /= activeData.Count;
                        }
                    }
                }
            }
        }
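        // The value accumulated above is, per GE feature, the cross-entropy between the empirical
        // class distribution and the model's average distribution over that feature's active data
        // (the KL divergence up to a constant). A standalone sketch with hypothetical distributions:
        using System;

        class GeObjectiveSketch
        {
            static void Main()
            {
                double[] empirical = { 0.7, 0.2, 0.1 };
                double[] model = { 0.5, 0.3, 0.2 };
                double value = 0.0;
                for (int c = 0; c < empirical.Length; c++)
                {
                    value += -empirical[c] * Math.Log(model[c]); // minimized when model == empirical
                }
                Console.WriteLine(value);
            }
        }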
        private void PrintResultsInternal(PrintWriter pw, ICounter <Pair <string, string> > results, ClassicCounter <string> labelCount)
        {
            ClassicCounter <string> correct         = new ClassicCounter <string>();
            ClassicCounter <string> predictionCount = new ClassicCounter <string>();
            bool countGoldLabels = false;

            if (labelCount == null)
            {
                labelCount      = new ClassicCounter <string>();
                countGoldLabels = true;
            }
            foreach (Pair <string, string> predictedActual in results.KeySet())
            {
                string predicted = predictedActual.first;
                string actual    = predictedActual.second;
                if (predicted.Equals(actual))
                {
                    correct.IncrementCount(actual, results.GetCount(predictedActual));
                }
                predictionCount.IncrementCount(predicted, results.GetCount(predictedActual));
                if (countGoldLabels)
                {
                    labelCount.IncrementCount(actual, results.GetCount(predictedActual));
                }
            }
            DecimalFormat formatter = new DecimalFormat();

            formatter.SetMaximumFractionDigits(1);
            formatter.SetMinimumFractionDigits(1);
            double totalCount     = 0;
            double totalCorrect   = 0;
            double totalPredicted = 0;

            pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF");
            IList <string> labels = new List <string>(labelCount.KeySet());

            labels.Sort();
            foreach (string label in labels)
            {
                double numcorrect = correct.GetCount(label);
                double predicted  = predictionCount.GetCount(label);
                double trueCount  = labelCount.GetCount(label);
                double precision  = (predicted > 0) ? (numcorrect / predicted) : 0;
                double recall     = numcorrect / trueCount;
                double f          = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
                pw.Println(StringUtils.PadOrTrim(label, MaxLabelLength) + "\t" + numcorrect + "\t" + predicted + "\t" + trueCount + "\t" + formatter.Format(precision * 100) + "\t" + formatter.Format(100 * recall) + "\t" + formatter.Format(100 * f));
                if (!RelationMention.IsUnrelatedLabel(label))
                {
                    totalCount     += trueCount;
                    totalCorrect   += numcorrect;
                    totalPredicted += predicted;
                }
            }
            double precision_1 = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0;
            double recall_1    = totalCorrect / totalCount;
            double f_1         = (totalPredicted > 0 && totalCorrect > 0) ? 2 * precision_1 * recall_1 / (precision_1 + recall_1) : 0.0;

            pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + formatter.Format(100 * precision_1) + "\t" + formatter.Format(100 * recall_1) + "\t" + formatter.Format(100 * f_1));
        }
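        // Per label, the table above reports precision = correct/predicted, recall = correct/actual,
        // and F1, their harmonic mean. A standalone sketch with hypothetical tallies:
        using System;

        class PrfSketch
        {
            static void Main()
            {
                double correct = 8, predicted = 10, actual = 16;
                double p = (predicted > 0) ? correct / predicted : 0; // 0.80
                double r = correct / actual;                          // 0.50
                double f = (p + r > 0) ? 2 * p * r / (p + r) : 0.0;   // 0.615...
                Console.WriteLine($"P={p:F2} R={r:F2} F1={f:F3}");
            }
        }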
        // Does L1 or L2 regularization using FOBOS with lazy updates, so L1 should not be
        // handled in the objective.
        // Alternatively, you can handle other regularization in the objective, but then,
        // if the derivative is not sparse, this routine would not be very efficient.
        // However, it might still be okay for CRFs.
        public virtual ICounter <K> Minimize(F function, ICounter <K> x, int maxIterations)
        {
            Sayln("       Batch size of: " + batchSize);
            Sayln("       Data dimension of: " + function.DataSize());
            int numBatches = (function.DataSize() - 1) / this.batchSize + 1;

            Sayln("       Batches per pass through data:  " + numBatches);
            Sayln("       Number of passes is = " + numPasses);
            Sayln("       Max iterations is = " + maxIterations);
            ICounter <K> lastUpdated = new ClassicCounter <K>();
            int          timeStep    = 0;
            Timing       total       = new Timing();

            total.Start();
            for (int iter = 0; iter < numPasses; iter++)
            {
                double totalObjValue = 0;
                for (int j = 0; j < numBatches; j++)
                {
                    int[] selectedData = GetSample(function, this.batchSize);
                    // the core adagrad
                    ICounter <K> gradient = function.DerivativeAt(x, selectedData);
                    totalObjValue = totalObjValue + function.ValueAt(x, selectedData);
                    foreach (K feature in gradient.KeySet())
                    {
                        double gradf              = gradient.GetCount(feature);
                        double prevrate           = eta / (Math.Sqrt(sumGradSquare.GetCount(feature)) + soften);
                        double sgsValue           = sumGradSquare.IncrementCount(feature, gradf * gradf);
                        double currentrate        = eta / (Math.Sqrt(sgsValue) + soften);
                        double testupdate         = x.GetCount(feature) - (currentrate * gradient.GetCount(feature));
                        double lastUpdateTimeStep = lastUpdated.GetCount(feature);
                        double idleinterval       = timeStep - lastUpdateTimeStep - 1;
                        lastUpdated.SetCount(feature, (double)timeStep);
                        // does lazy update using idleinterval
                        double trunc      = Math.Max(0.0, (Math.Abs(testupdate) - (currentrate + prevrate * idleinterval) * this.lambdaL1));
                        double trunc2     = trunc * Math.Pow(1 - this.lambdaL2, currentrate + prevrate * idleinterval);
                        double realupdate = Math.Signum(testupdate) * trunc2;
                        if (realupdate < Eps)
                        {
                            x.Remove(feature);
                        }
                        else
                        {
                            x.SetCount(feature, realupdate);
                        }
                        // reporting
                        timeStep++;
                        if (timeStep > maxIterations)
                        {
                            Sayln("Stochastic Optimization complete.  Stopped after max iterations");
                            break;
                        }
                        Sayln(string.Format("Iter {0} \t batch: {1} \t time={2:F2} \t obj={3:F4}", iter, timeStep, total.Report() / 1000.0, totalObjValue));
                    }
                }
            }
            return(x);
        }
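        // A scalar sketch of the core update above, under hypothetical values: the AdaGrad step
        // shrinks as squared gradients accumulate, and the lazy FOBOS truncation applies all the
        // L1/L2 shrinkage a feature missed while idle in one go.
        using System;

        class AdaGradFobosSketch
        {
            static void Main()
            {
                double eta = 0.1, soften = 1e-4, lambdaL1 = 0.01, lambdaL2 = 0.0;
                double x = 0.5, sumGradSquare = 4.0, grad = 0.2, idleInterval = 3;
                double prevRate = eta / (Math.Sqrt(sumGradSquare) + soften);
                sumGradSquare += grad * grad;
                double currentRate = eta / (Math.Sqrt(sumGradSquare) + soften);
                double testUpdate = x - currentRate * grad; // the plain AdaGrad step
                double trunc = Math.Max(0.0,
                    Math.Abs(testUpdate) - (currentRate + prevRate * idleInterval) * lambdaL1);
                double trunc2 = trunc * Math.Pow(1 - lambdaL2, currentRate + prevRate * idleInterval);
                double realUpdate = Math.Sign(testUpdate) * trunc2;
                Console.WriteLine(realUpdate); // collapses to exactly 0 when L1 outweighs the step
            }
        }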
            private static void Display <T>(ClassicCounter <T> c, PrintWriter pw)
            {
                IList <T> cats = new List <T>(c.KeySet());

                cats.Sort(Counters.ToComparatorDescending(c));
                foreach (T ob in cats)
                {
                    pw.Println(ob + " " + c.GetCount(ob));
                }
            }
        public virtual void RunCoref(Document document)
        {
            Compressor <string> compressor = new Compressor <string>();

            if (Thread.Interrupted())
            {
                // Allow interrupting
                throw new RuntimeInterruptedException();
            }
            IDictionary <Pair <int, int>, bool> pairs = new Dictionary <Pair <int, int>, bool>();

            foreach (KeyValuePair <int, IList <int> > e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch))
            {
                foreach (int m1 in e.Value)
                {
                    pairs[new Pair <int, int>(m1, e.Key)] = true;
                }
            }
            DocumentExamples            examples       = extractor.Extract(0, document, pairs, compressor);
            ICounter <Pair <int, int> > pairwiseScores = new ClassicCounter <Pair <int, int> >();

            foreach (Example mentionPair in examples.examples)
            {
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                pairwiseScores.IncrementCount(new Pair <int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor));
            }
            IList <Pair <int, int> > mentionPairs = new List <Pair <int, int> >(pairwiseScores.KeySet());

            mentionPairs.Sort(null);
            ICollection <int> seenAnaphors = new HashSet <int>();

            foreach (Pair <int, int> pair in mentionPairs)
            {
                if (seenAnaphors.Contains(pair.second))
                {
                    continue;
                }
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                seenAnaphors.Add(pair.second);
                Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType;
                Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType;
                if (pairwiseScores.GetCount(pair) > thresholds[new Pair <bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)])
                {
                    CorefUtils.MergeCoreferenceClusters(pair, document);
                }
            }
        }
        public virtual void DumpStats()
        {
            System.Console.Out.WriteLine("%% Counts of nonterminals:");
            IList <string> biggestCounts = new List <string>(nonTerms.KeySet());

            biggestCounts.Sort(Counters.ToComparatorDescending(nonTerms));
            foreach (string str in biggestCounts)
            {
                System.Console.Out.WriteLine(str + ": " + nonTerms.GetCount(str));
            }
        }
        public override IDependencyGrammar FormResult()
        {
            wordIndex.AddToIndex(LexiconConstants.UnknownWord);
            MLEDependencyGrammar dg = new MLEDependencyGrammar(tlpParams, directional, useDistance, useCoarseDistance, basicCategoryTagsInDependencyGrammar, op, wordIndex, tagIndex);

            foreach (IntDependency dependency in dependencyCounter.KeySet())
            {
                dg.AddRule(dependency, dependencyCounter.GetCount(dependency));
            }
            return(dg);
        }
        public static TransducerGraph CreateGraphFromPaths <T>(ClassicCounter <IList <T> > pathCounter, int markovOrder)
        {
            TransducerGraph graph = new TransducerGraph(); // starts empty

            foreach (IList <T> path in pathCounter.KeySet())
            {
                double count = pathCounter.GetCount(path);
                AddOnePathToGraph(path, count, markovOrder, graph);
            }
            return(graph);
        }
        /// <summary>
        /// Return the probability (as a real number between 0 and 1) of stopping
        /// rather than generating another argument at this position.
        /// </summary>
        /// <param name="dependency">
        /// The dependency used as the basis for stopping on.
        /// Tags are assumed to be in the TagProjection space.
        /// </param>
        /// <returns>The smoothed probability of stopping rather than generating another argument</returns>
        protected internal virtual double GetStopProb(IntDependency dependency)
        {
            short         binDistance  = DistanceBin(dependency.distance);
            IntTaggedWord unknownHead  = new IntTaggedWord(-1, dependency.head.tag);
            IntTaggedWord anyHead      = new IntTaggedWord(IntTaggedWord.AnyWordInt, dependency.head.tag);
            IntDependency temp         = new IntDependency(dependency.head, stopTW, dependency.leftHeaded, binDistance);
            double        c_stop_hTWds = stopCounter.GetCount(temp);

            temp = new IntDependency(unknownHead, stopTW, dependency.leftHeaded, binDistance);
            double c_stop_hTds = stopCounter.GetCount(temp);

            temp = new IntDependency(dependency.head, wildTW, dependency.leftHeaded, binDistance);
            double c_hTWds = stopCounter.GetCount(temp);

            temp = new IntDependency(anyHead, wildTW, dependency.leftHeaded, binDistance);
            double c_hTds        = stopCounter.GetCount(temp);
            double p_stop_hTds   = (c_hTds > 0.0 ? c_stop_hTds / c_hTds : 1.0);
            double pb_stop_hTWds = (c_stop_hTWds + smooth_stop * p_stop_hTds) / (c_hTWds + smooth_stop);

            return(pb_stop_hTWds);
        }
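        // The final estimate interpolates the sparse word-specific relative frequency with the
        // tag-only back-off, weighted by smooth_stop. A standalone sketch with hypothetical counts:
        using System;

        class StopProbSketch
        {
            static void Main()
            {
                double smoothStop = 8.0;             // hypothetical smoothing constant
                double cStopHTWds = 3, cHTWds = 4;   // counts for this head word + tag
                double cStopHTds = 300, cHTds = 400; // backed-off counts for the tag alone
                double pStopHTds = (cHTds > 0.0) ? cStopHTds / cHTds : 1.0; // 0.75
                double pbStopHTWds = (cStopHTWds + smoothStop * pStopHTds) / (cHTWds + smoothStop);
                Console.WriteLine(pbStopHTWds); // 0.75: the back-off dominates at low counts
            }
        }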
        /// <summary>Need to sort the counter by feature keys and dump it</summary>
        public static void PrintSVMLightFormat(PrintWriter pw, ClassicCounter <int> c, int classNo)
        {
            int[] features = Sharpen.Collections.ToArray(c.KeySet(), new int[c.KeySet().Count]);
            Arrays.Sort(features);
            StringBuilder sb = new StringBuilder();

            sb.Append(classNo);
            sb.Append(' ');
            foreach (int f in features)
            {
                sb.Append(f + 1).Append(':').Append(c.GetCount(f)).Append(' ');
            }
            pw.Println(sb.ToString());
        }
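        // The emitted line format is "<class> <featureId>:<value> ...", with feature ids sorted
        // and shifted to be 1-based, as svm_light expects. A standalone sketch of one line:
        using System;
        using System.Collections.Generic;
        using System.Linq;
        using System.Text;

        class SvmLightLineSketch
        {
            static void Main()
            {
                // hypothetical sparse datum of class 2: feature index -> value
                var c = new Dictionary<int, double> { [0] = 1.0, [3] = 0.5 };
                StringBuilder sb = new StringBuilder().Append(2).Append(' ');
                foreach (int f in c.Keys.OrderBy(k => k))
                {
                    sb.Append(f + 1).Append(':').Append(c[f]).Append(' ');
                }
                Console.WriteLine(sb); // "2 1:1 4:0.5 "
            }
        }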
        /// <summary>A utility to get useful information out of a CorefMention.</summary>
        /// <remarks>
        /// A utility to get useful information out of a CorefMention. In particular, it returns the CoreLabels which are
        /// associated with this mention, and it returns a score for how much we think this mention should be the canonical
        /// mention.
        /// </remarks>
        /// <param name="doc">The document this mention is referenced into.</param>
        /// <param name="mention">The mention itself.</param>
        /// <returns>A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.</returns>
        private static Pair <IList <CoreLabel>, double> GrokCorefMention(Annotation doc, CorefChain.CorefMention mention)
        {
            IList <CoreLabel> tokens          = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[mention.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <CoreLabel> mentionAsTokens = tokens.SubList(mention.startIndex - 1, mention.endIndex - 1);
            // Try to assess this mention's NER type
            ICounter <string> nerVotes = new ClassicCounter <string>();

            mentionAsTokens.Stream().Filter(null).ForEach(null);
            string ner      = Counters.Argmax(nerVotes, null);
            double nerCount = nerVotes.GetCount(ner);
            double nerScore = nerCount * nerCount / ((double)mentionAsTokens.Count);

            // Return
            return(Pair.MakePair(mentionAsTokens, nerScore));
        }
        /// <summary>
        /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
        /// using the actual features and labels.
        /// </summary>
        /// <remarks>
        /// Converts the svm_light weight Counter (which uses feature indices) into a weight Counter
        /// using the actual features and labels.  Because this is svm_light, and not svm_struct, the
        /// weights for the +1 class (which correspond to labelIndex.get(0)) and the -1 class
        /// (which correspond to labelIndex.get(1)) are just the negation of one another.
        /// </remarks>
        private ClassicCounter <Pair <F, L> > ConvertSVMLightWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex)
        {
            ClassicCounter <Pair <F, L> > newWeights = new ClassicCounter <Pair <F, L> >();

            foreach (int i in weights.KeySet())
            {
                F      f = featureIndex.Get(i - 1);
                double w = weights.GetCount(i);
                // the first guy in the labelIndex was the +1 class and the second guy
                // was the -1 class
                newWeights.IncrementCount(new Pair <F, L>(f, labelIndex.Get(0)), w);
                newWeights.IncrementCount(new Pair <F, L>(f, labelIndex.Get(1)), -w);
            }
            return(newWeights);
        }
        /// <summary>
        /// Converts the svm_struct weight Counter (in which the weight for a feature/label pair
        /// corresponds to ((labelIndex * numFeatures)+(featureIndex+1))) into a weight Counter
        /// using the actual features and labels.
        /// </summary>
        private ClassicCounter <Pair <F, L> > ConvertSVMStructWeights(ClassicCounter <int> weights, IIndex <F> featureIndex, IIndex <L> labelIndex)
        {
            // int numLabels = labelIndex.size();
            int numFeatures = featureIndex.Size();
            ClassicCounter <Pair <F, L> > newWeights = new ClassicCounter <Pair <F, L> >();

            foreach (int i in weights.KeySet())
            {
                L l = labelIndex.Get((i - 1) / numFeatures);
                // integer division on purpose
                F      f = featureIndex.Get((i - 1) % numFeatures);
                double w = weights.GetCount(i);
                newWeights.IncrementCount(new Pair <F, L>(f, l), w);
            }
            return(newWeights);
        }
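        // The decoding above unpacks the 1-based svm_struct weight index packed as
        // i = labelIndex * numFeatures + featureIndex + 1. A standalone sketch:
        using System;

        class SvmStructIndexSketch
        {
            static void Main()
            {
                int numFeatures = 100;
                int i = 205;                            // hypothetical weight index
                int labelIdx = (i - 1) / numFeatures;   // 2 (integer division on purpose)
                int featureIdx = (i - 1) % numFeatures; // 4
                Console.WriteLine($"label {labelIdx}, feature {featureIdx}");
            }
        }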
            private static void Display <T>(ClassicCounter <T> c, int num, PrintWriter pw)
            {
                IList <T> rules = new List <T>(c.KeySet());

                rules.Sort(Counters.ToComparatorDescending(c));
                int rSize = rules.Count;

                if (num > rSize)
                {
                    num = rSize;
                }
                for (int i = 0; i < num; i++)
                {
                    pw.Println(rules[i] + " " + c.GetCount(rules[i]));
                }
            }
        /// <summary>Trains the first-character based unknown word model.</summary>
        /// <param name="tw">The word we are currently training on</param>
        /// <param name="loc">The position of that word</param>
        /// <param name="weight">The weight to give this word in terms of training</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            string word  = tw.Word();
            ILabel tagL  = new Tag(tw.Tag());
            string first = Sharpen.Runtime.Substring(word, 0, 1);

            if (useUnicodeType)
            {
                char ch = word[0];
                // .NET port of Java's Character.getType: the category codes differ, but they
                // serve the same purpose of bucketing non-letter first characters by type
                System.Globalization.UnicodeCategory type = System.Globalization.CharUnicodeInfo.GetUnicodeCategory(ch);
                if (type != System.Globalization.UnicodeCategory.OtherLetter)
                {
                    // standard Chinese characters are of type "OTHER_LETTER"!!
                    first = ((int)type).ToString();
                }
            }
            string tag = tw.Tag();

            if (!c.Contains(tagL))
            {
                c[tagL] = new ClassicCounter <string>();
            }
            c[tagL].IncrementCount(first, weight);
            tc.IncrementCount(tagL, weight);
            seenFirst.Add(first);
            IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once we are some way through the trees;
                // treesRead uses 1-based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iTotal, weight);
                }
            }
        }
        public virtual void RunCoref(Document document)
        {
            IList <Mention> sortedMentions = CorefUtils.GetSortedMentions(document);
            IDictionary <int, IList <Mention> > mentionsByHeadIndex = new Dictionary <int, IList <Mention> >();

            foreach (Mention m in sortedMentions)
            {
                IList <Mention> withIndex = mentionsByHeadIndex.ComputeIfAbsent(m.headIndex, null);
                withIndex.Add(m);
            }
            SimpleMatrix documentEmbedding = embeddingExtractor.GetDocumentEmbedding(document);
            IDictionary <int, SimpleMatrix> antecedentEmbeddings = new Dictionary <int, SimpleMatrix>();
            IDictionary <int, SimpleMatrix> anaphorEmbeddings    = new Dictionary <int, SimpleMatrix>();
            ICounter <int> anaphoricityScores = new ClassicCounter <int>();

            foreach (Mention m_1 in sortedMentions)
            {
                SimpleMatrix mentionEmbedding = embeddingExtractor.GetMentionEmbeddings(m_1, documentEmbedding);
                antecedentEmbeddings[m_1.mentionID] = model.GetAntecedentEmbedding(mentionEmbedding);
                anaphorEmbeddings[m_1.mentionID]    = model.GetAnaphorEmbedding(mentionEmbedding);
                anaphoricityScores.IncrementCount(m_1.mentionID, model.GetAnaphoricityScore(mentionEmbedding, featureExtractor.GetAnaphoricityFeatures(m_1, document, mentionsByHeadIndex)));
            }
            IDictionary <int, IList <int> > mentionToCandidateAntecedents = CorefUtils.HeuristicFilter(sortedMentions, maxMentionDistance, maxMentionDistanceWithStringMatch);

            foreach (KeyValuePair <int, IList <int> > e in mentionToCandidateAntecedents)
            {
                double bestScore  = anaphoricityScores.GetCount(e.Key) - 50 * (greedyness - 0.5);
                int    m_2        = e.Key;
                int?   antecedent = null;
                foreach (int ca in e.Value)
                {
                    double score = model.GetPairwiseScore(antecedentEmbeddings[ca], anaphorEmbeddings[m_2], featureExtractor.GetPairFeatures(new Pair <int, int>(ca, m_2), document, mentionsByHeadIndex));
                    if (score > bestScore)
                    {
                        bestScore  = score;
                        antecedent = ca;
                    }
                }
                if (antecedent != null)
                {
                    CorefUtils.MergeCoreferenceClusters(new Pair <int, int>(antecedent.Value, m_2), document);
                }
            }
        }
        private static ICounter <string> GetFeatures(ClustererDataLoader.ClustererDoc doc, IList <Pair <int, int> > mentionPairs, ICounter <Pair <int, int> > scores)
        {
            ICounter <string> features  = new ClassicCounter <string>();
            double            maxScore  = 0;
            double            minScore  = 1;
            ICounter <string> totals    = new ClassicCounter <string>();
            ICounter <string> totalsLog = new ClassicCounter <string>();
            ICounter <string> counts    = new ClassicCounter <string>();

            foreach (Pair <int, int> mentionPair in mentionPairs)
            {
                if (!scores.ContainsKey(mentionPair))
                {
                    mentionPair = new Pair <int, int>(mentionPair.second, mentionPair.first);
                }
                double score    = scores.GetCount(mentionPair);
                double logScore = CappedLog(score);
                string mt1      = doc.mentionTypes[mentionPair.first];
                string mt2      = doc.mentionTypes[mentionPair.second];
                mt1 = mt1.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
                mt2 = mt2.Equals("PRONOMINAL") ? "PRONOMINAL" : "NON_PRONOMINAL";
                string conj = "_" + mt1 + "_" + mt2;
                maxScore = Math.Max(maxScore, score);
                minScore = Math.Min(minScore, score);
                totals.IncrementCount(string.Empty, score);
                totalsLog.IncrementCount(string.Empty, logScore);
                counts.IncrementCount(string.Empty);
                totals.IncrementCount(conj, score);
                totalsLog.IncrementCount(conj, logScore);
                counts.IncrementCount(conj);
            }
            features.IncrementCount("max", maxScore);
            features.IncrementCount("min", minScore);
            foreach (string key in counts.KeySet())
            {
                features.IncrementCount("avg" + key, totals.GetCount(key) / mentionPairs.Count);
                features.IncrementCount("avgLog" + key, totalsLog.GetCount(key) / mentionPairs.Count);
            }
            return(features);
        }
        // todo: Fix javadoc, have unit tested
        /// <summary>Print SVM Light Format file.</summary>
        /// <remarks>
        /// Print SVM Light Format file.
        /// The following comments are no longer applicable because I am
        /// now printing out the exact labelID for each example. -Ramesh ([email protected]) 12/17/2009.
        /// If the Dataset has more than 2 classes, then it
        /// prints using the label index (+1) (for svm_struct).  If it is 2 classes, then the labelIndex.get(0)
        /// is mapped to +1 and labelIndex.get(1) is mapped to -1 (for svm_light).
        /// </remarks>
        public virtual void PrintSVMLightFormat(PrintWriter pw)
        {
            //assumes each data item has a few features on, and sorts the feature keys while collecting the values in a counter
            // old comment:
            // the following code commented out by Ramesh ([email protected]) 12/17/2009.
            // why not simply print the exact id of the label instead of mapping to some values??
            // new comment:
            // mihai: we NEED this, because svm_light has special conventions not supported by default by our labels,
            //        e.g., in a multiclass setting it assumes that labels start at 1 whereas our labels start at 0 (08/31/2010)
            string[] labelMap = MakeSvmLabelMap();
            for (int i = 0; i < size; i++)
            {
                RVFDatum <L, F>      d      = GetRVFDatum(i);
                ICounter <F>         c      = d.AsFeaturesCounter();
                ClassicCounter <int> printC = new ClassicCounter <int>();
                foreach (F f in c.KeySet())
                {
                    printC.SetCount(featureIndex.IndexOf(f), c.GetCount(f));
                }
                int[] features = Sharpen.Collections.ToArray(printC.KeySet(), new int[printC.KeySet().Count]);
                Arrays.Sort(features);
                StringBuilder sb = new StringBuilder();
                sb.Append(labelMap[labels[i]]).Append(' ');
                // sb.append(labels[i]).append(' '); // commented out by mihai: labels[i] breaks svm_light conventions!

                /* Old code: assumes that F is Integer....
                 *
                 * for (int f: features) {
                 * sb.append((f + 1)).append(":").append(c.getCount(f)).append(" ");
                 * }
                 */
                //I think this is what was meant (using printC rather than c), but not sure
                // ~Sarah Spikes ([email protected])
                foreach (int f_1 in features)
                {
                    sb.Append((f_1 + 1)).Append(':').Append(printC.GetCount(f_1)).Append(' ');
                }
                pw.Println(sb.ToString());
            }
        }