Example #1
        /// <summary>
        /// Trains a batch of trees and returns the following: a list of
        /// Update objects, the number of transitions correct, and the number
        /// of transitions wrong.
        /// </summary>
        /// <remarks>
        /// Trains a batch of trees and returns the following: a list of
        /// Update objects, the number of transitions correct, and the number
        /// of transitions wrong.
        /// If the model is trained with multiple threads, it is expected
        /// that a valid MulticoreWrapper is passed in which does the
        /// processing.  In that case, the processing is done on all of the
        /// trees without updating any weights, which allows the results for
        /// multithreaded training to be reproduced.
        /// </remarks>
        private Triple <IList <PerceptronModel.Update>, int, int> TrainBatch(IList <int> indices, IList <Tree> binarizedTrees, IList <IList <ITransition> > transitionLists, IList <PerceptronModel.Update> updates, Oracle oracle, MulticoreWrapper <int, Pair <int, int> > wrapper)
        {
            int numCorrect = 0;
            int numWrong   = 0;

            if (op.trainOptions.trainingThreads == 1)
            {
                foreach (int index in indices)
                {
                    Pair <int, int> count = TrainTree(index, binarizedTrees, transitionLists, updates, oracle);
                    numCorrect += count.first;
                    numWrong   += count.second;
                }
            }
            else
            {
                foreach (int index in indices)
                {
                    wrapper.Put(index);
                }
                wrapper.Join(false);
                while (wrapper.Peek())
                {
                    Pair <int, int> result = wrapper.Poll();
                    numCorrect += result.first;
                    numWrong   += result.second;
                }
            }
            return(new Triple <IList <PerceptronModel.Update>, int, int>(updates, numCorrect, numWrong));
        }
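In TrainBatch above, every tree is submitted before any result is drained, and Join(false)/Poll merge the results in a fixed order, which is what makes multithreaded training reproducible. Below is a minimal standalone sketch of the same idea using only Task (not the Stanford MulticoreWrapper API; TrainOne is a hypothetical stand-in for TrainTree):

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

static class ReproducibleBatch
{
    // Hypothetical per-item work standing in for TrainTree: returns
    // (transitions correct, transitions wrong) for one tree index.
    static (int Correct, int Wrong) TrainOne(int index) => (index % 2, (index + 1) % 2);

    static (int Correct, int Wrong) TrainBatch(IReadOnlyList<int> indices)
    {
        // Submit every item first; the tasks run concurrently.
        Task<(int Correct, int Wrong)>[] jobs =
            indices.Select(i => Task.Run(() => TrainOne(i))).ToArray();

        // Drain in submission order, so the merge order (and therefore the
        // total) is identical on every run, whatever order the tasks finish in.
        int correct = 0, wrong = 0;
        foreach (var job in jobs)
        {
            correct += job.Result.Correct;
            wrong   += job.Result.Wrong;
        }
        return (correct, wrong);
    }

    static void Main()
    {
        var (c, w) = TrainBatch(Enumerable.Range(0, 10).ToList());
        Console.WriteLine($"correct={c} wrong={w}");
    }
}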
Example #2
 public virtual void Annotate(Annotation annotation)
 {
     // turn the annotation into a sentence
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         if (nThreads == 1)
         {
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 DoOneSentence(sentence);
             }
         }
         else
         {
             MulticoreWrapper <ICoreMap, ICoreMap> wrapper = new MulticoreWrapper <ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 wrapper.Put(sentence);
                 while (wrapper.Peek())
                 {
                     wrapper.Poll();
                 }
             }
             wrapper.Join();
             while (wrapper.Peek())
             {
                 wrapper.Poll();
             }
         }
     }
     else
     {
         throw new Exception("unable to find words/tokens in: " + annotation);
     }
 }
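Note that the polled results above are discarded: POSTaggerProcessor tags each ICoreMap in place, so the wrapper is used only to bound concurrency. A standalone sketch of the same in-place pattern with Parallel.ForEach (SentenceStub is a hypothetical stand-in):

using System;
using System.Threading.Tasks;

class SentenceStub
{
    public string Text;
    public string[] Tags;
    public SentenceStub(string text) { Text = text; Tags = new string[0]; }
}

static class InPlaceAnnotate
{
    static void Main()
    {
        var sentences = new[]
        {
            new SentenceStub("the cat sat"),
            new SentenceStub("dogs bark"),
        };

        // Tag each element in place on a bounded pool; results are not
        // collected, mirroring the poll-and-discard loops in Annotate above.
        Parallel.ForEach(sentences,
                         new ParallelOptions { MaxDegreeOfParallelism = 4 },
                         s => s.Tags = Array.ConvertAll(s.Text.Split(' '), w => "NN"));

        foreach (var s in sentences)
            Console.WriteLine(s.Text + " -> " + string.Join(",", s.Tags));
    }
}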
Example #3
        /// <summary>
        /// Fix tree structure, phrasal categories and part-of-speech labels in newly expanded
        /// multi-word tokens.
        /// </summary>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        private IList <Tree> FixMultiWordTokens(IList <Tree> trees)
        {
            bool ner = PropertiesUtils.GetBool(options, "ner", false);
            // Shared resources
            IFactory <TreeNormalizer> tnf = new _IFactory_389();
            ITreeFactory tf = new LabeledScoredTreeFactory();
            IThreadsafeProcessor <ICollection <Tree>, ICollection <Tree> > processor = new AnCoraProcessor.MultiWordProcessor(this, tnf, tf, ner);
            int availableProcessors = Runtime.GetRuntime().AvailableProcessors();
            MulticoreWrapper <ICollection <Tree>, ICollection <Tree> > wrapper = new MulticoreWrapper <ICollection <Tree>, ICollection <Tree> >(availableProcessors, processor, false);
            // Chunk our work so that parallelization is actually worth it
            int numChunks = availableProcessors * 20;
            IList <IList <Tree> > chunked = CollectionUtils.PartitionIntoFolds(trees, numChunks);
            IList <Tree>          ret     = new List <Tree>();

            foreach (ICollection <Tree> coll in chunked)
            {
                wrapper.Put(coll);
                while (wrapper.Peek())
                {
                    Sharpen.Collections.AddAll(ret, wrapper.Poll());
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                Sharpen.Collections.AddAll(ret, wrapper.Poll());
            }
            return(ret);
        }
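The chunking comment above is the key point: submitting availableProcessors * 20 folds keeps every worker busy while amortizing per-job overhead. A rough standalone analogue of CollectionUtils.PartitionIntoFolds, assuming contiguous near-equal folds (the library's exact splitting policy may differ):

using System;
using System.Collections.Generic;

static class Folds
{
    // Split items into numFolds contiguous lists of near-equal size.
    public static List<List<T>> PartitionIntoFolds<T>(IReadOnlyList<T> items, int numFolds)
    {
        var folds = new List<List<T>>();
        int baseSize = items.Count / numFolds, remainder = items.Count % numFolds, index = 0;
        for (int f = 0; f < numFolds; f++)
        {
            int size = baseSize + (f < remainder ? 1 : 0); // spread the remainder over the first folds
            folds.Add(new List<T>());
            for (int j = 0; j < size; j++) folds[f].Add(items[index++]);
        }
        return folds;
    }

    static void Main()
    {
        int cores = Environment.ProcessorCount;
        int numChunks = cores * 20; // many more chunks than threads smooths load imbalance
        var chunks = PartitionIntoFolds(new List<int> { 1, 2, 3, 4, 5, 6, 7 }, 3);
        Console.WriteLine($"{numChunks} chunks on {cores} cores; demo fold 0 has {chunks[0].Count} items");
    }
}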
Example #4
        /// <summary>
        /// Determine the total cost on the dataset associated with this
        /// classifier using the current learned parameters.
        /// </summary>
        /// <remarks>
        /// Determine the total cost on the dataset associated with this
        /// classifier using the current learned parameters. This cost is
        /// evaluated using mini-batch adaptive gradient descent.
        /// This method launches multiple threads, each of which evaluates
        /// training cost on a partition of the mini-batch.
        /// </remarks>
        /// <param name="batchSize"/>
        /// <param name="regParameter">Regularization parameter (lambda)</param>
        /// <param name="dropOutProb">
        /// Drop-out probability. Hidden-layer units in the
        /// neural network will be randomly turned off
        /// while training a particular example with this
        /// probability.
        /// </param>
        /// <returns>
        /// A
        /// <see cref="Cost"/>
        /// object which describes the total cost of the given
        /// weights, and includes gradients to be used for further
        /// training
        /// </returns>
        public virtual Classifier.Cost ComputeCostFunction(int batchSize, double regParameter, double dropOutProb)
        {
            ValidateTraining();
            IList <Example> examples = Edu.Stanford.Nlp.Parser.Nndep.Util.GetRandomSubList(dataset.examples, batchSize);
            // Redo precomputations for only those features which are triggered
            // by examples in this mini-batch.
            ICollection <int> toPreCompute = GetToPreCompute(examples);

            PreCompute(toPreCompute);
            // Set up parameters for feedforward
            Classifier.FeedforwardParams @params = new Classifier.FeedforwardParams(batchSize, dropOutProb);
            // Zero out saved-embedding gradients
            gradSaved = new double[][] {  };
            int numChunks = config.trainingThreads;
            IList <IList <Example> > chunks = CollectionUtils.PartitionIntoFolds(examples, numChunks);

            // Submit chunks for processing on separate threads
            foreach (ICollection <Example> chunk in chunks)
            {
                jobHandler.Put(new Pair <ICollection <Example>, Classifier.FeedforwardParams>(chunk, @params));
            }
            jobHandler.Join(false);
            // Join costs from each chunk
            Classifier.Cost cost = null;
            while (jobHandler.Peek())
            {
                Classifier.Cost otherCost = jobHandler.Poll();
                if (cost == null)
                {
                    cost = otherCost;
                }
                else
                {
                    cost.Merge(otherCost);
                }
            }
            if (cost == null)
            {
                return(null);
            }
            // Backpropagate gradients on saved pre-computed values to actual
            // embeddings
            cost.BackpropSaved(toPreCompute);
            cost.AddL2Regularization(regParameter);
            return(cost);
        }
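The Peek/Poll loop above folds per-chunk costs together with Cost.Merge. A minimal standalone sketch of that map-reduce shape, with a hypothetical PartialCost standing in for Classifier.Cost:

using System;
using System.Linq;
using System.Threading.Tasks;

class PartialCost
{
    public double Value;
    public double[] Grad;
    public PartialCost(double value, double[] grad) { Value = value; Grad = grad; }

    // Accumulate another chunk's cost and gradient into this one.
    public void Merge(PartialCost other)
    {
        Value += other.Value;
        for (int i = 0; i < Grad.Length; i++) Grad[i] += other.Grad[i];
    }
}

static class CostDemo
{
    // Hypothetical per-chunk evaluation.
    static PartialCost CostOfChunk(int chunkId) =>
        new PartialCost(chunkId * 0.5, new[] { (double)chunkId, -chunkId });

    static void Main()
    {
        // Evaluate chunks in parallel, then fold the partial costs together,
        // mirroring the Peek/Poll/Merge loop above.
        var partials = Enumerable.Range(0, 4)
                                 .AsParallel()
                                 .Select(CostOfChunk)
                                 .ToList();
        PartialCost total = partials[0];
        foreach (var p in partials.Skip(1)) total.Merge(p);
        Console.WriteLine($"value={total.Value}, grad=[{string.Join(", ", total.Grad)}]");
    }
}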
Example #5
        protected internal override void Calculate(double[] theta)
        {
            model.VectorToParams(theta);
            SentimentCostAndGradient.ModelDerivatives derivatives;
            if (model.op.trainOptions.nThreads == 1)
            {
                derivatives = ScoreDerivatives(trainingBatch);
            }
            else
            {
                // TODO: because some addition operations happen in different
                // orders now, this results in slightly different values, which
                // over time add up to significantly different models even when
                // given the same random seed.  Probably not a big deal.
                // To be more specific, for trees T1, T2, T3, ... Tn,
                // when using one thread, we sum the derivatives T1 + T2 ...
                // When using multiple threads, we first sum T1 + ... + Tk,
                // then sum Tk+1 + ... + T2k, etc, for split size k.
                // The splits are then summed in order.
                // This different sum order results in slightly different numbers.
                MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives> wrapper = new MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives>(model.op.trainOptions.nThreads, new SentimentCostAndGradient.ScoringProcessor(this));
                // use wrapper.nThreads in case the number of threads was automatically changed
                foreach (IList <Tree> chunk in CollectionUtils.PartitionIntoFolds(trainingBatch, wrapper.NThreads()))
                {
                    wrapper.Put(chunk);
                }
                wrapper.Join();
                derivatives = new SentimentCostAndGradient.ModelDerivatives(model);
                while (wrapper.Peek())
                {
                    SentimentCostAndGradient.ModelDerivatives batchDerivatives = wrapper.Poll();
                    derivatives.Add(batchDerivatives);
                }
            }
            // scale the error by the number of sentences so that the
            // regularization isn't drowned out for large training batches
            double scale = (1.0 / trainingBatch.Count);

            value      = derivatives.error * scale;
            value     += ScaleAndRegularize(derivatives.binaryTD, model.binaryTransform, scale, model.op.trainOptions.regTransformMatrix, false);
            value     += ScaleAndRegularize(derivatives.binaryCD, model.binaryClassification, scale, model.op.trainOptions.regClassification, true);
            value     += ScaleAndRegularizeTensor(derivatives.binaryTensorTD, model.binaryTensors, scale, model.op.trainOptions.regTransformTensor);
            value     += ScaleAndRegularize(derivatives.unaryCD, model.unaryClassification, scale, model.op.trainOptions.regClassification, false, true);
            value     += ScaleAndRegularize(derivatives.wordVectorD, model.wordVectors, scale, model.op.trainOptions.regWordVector, true, false);
            derivative = NeuralUtils.ParamsToVector(theta.Length, derivatives.binaryTD.ValueIterator(), derivatives.binaryCD.ValueIterator(), SimpleTensor.IteratorSimpleMatrix(derivatives.binaryTensorTD.ValueIterator()), derivatives.unaryCD.Values.GetEnumerator(), derivatives.wordVectorD.Values.GetEnumerator());
        }
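The TODO in the multithreaded branch is worth internalizing: floating-point addition is not associative, so chunked sums legitimately differ from sequential ones. A tiny demonstration:

using System;

static class FloatOrderDemo
{
    static void Main()
    {
        // Same three numbers, two grouping orders:
        double leftToRight = (1e16 + 1.0) + 1.0; // each 1.0 is absorbed: 1e16
        double grouped     = 1e16 + (1.0 + 1.0); // the 1.0s combine first: 1e16 + 2
        Console.WriteLine(leftToRight == grouped); // False
        Console.WriteLine(grouped - leftToRight);  // 2
        // Across millions of tree derivatives the same effect accumulates,
        // which is why chunked (multithreaded) sums drift from sequential ones.
    }
}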
Example #6
 public static void RedoTags(IList <Tree> trees, Edu.Stanford.Nlp.Tagger.Common.Tagger tagger, int nThreads)
 {
     if (nThreads == 1)
     {
         foreach (Tree tree in trees)
         {
             RedoTags(tree, tagger);
         }
     }
     else
     {
         MulticoreWrapper <Tree, Tree> wrapper = new MulticoreWrapper <Tree, Tree>(nThreads, new ShiftReduceParser.RetagProcessor(tagger));
         foreach (Tree tree in trees)
         {
             wrapper.Put(tree);
         }
         wrapper.Join();
     }
 }
Example #7
        public static IdentityHashMap <Tree, IList <Tree> > ConvertToTrees(ICollection <Tree> keys, IdentityHashMap <Tree, byte[]> compressed, int numThreads)
        {
            IdentityHashMap <Tree, IList <Tree> >    uncompressed = Generics.NewIdentityHashMap();
            MulticoreWrapper <byte[], IList <Tree> > wrapper      = new MulticoreWrapper <byte[], IList <Tree> >(numThreads, new CacheParseHypotheses.DecompressionProcessor());

            foreach (Tree tree in keys)
            {
                wrapper.Put(compressed[tree]);
            }
            foreach (Tree tree_1 in keys)
            {
                if (!wrapper.Peek())
                {
                    wrapper.Join();
                }
                uncompressed[tree_1] = wrapper.Poll();
            }
            return(uncompressed);
        }
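ConvertToTrees keys its result map by object identity and relies on the wrapper handing results back in submission order. A standalone sketch of both ideas, using .NET's ReferenceEqualityComparer (available since .NET 5) and in-order task draining; Expand is a hypothetical stand-in for the decompression processor:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

class Node { public string Label = ""; }

static class IdentityMapDemo
{
    // Hypothetical stand-in for decompression: expand one payload into results.
    static List<string> Expand(byte[] payload) =>
        payload.Select(b => $"tree-{b}").ToList();

    static void Main()
    {
        var keys = new List<Node> { new Node { Label = "a" }, new Node { Label = "b" } };
        var compressed = new Dictionary<Node, byte[]>(ReferenceEqualityComparer.Instance)
        {
            [keys[0]] = new byte[] { 1, 2 },
            [keys[1]] = new byte[] { 3 },
        };

        // Submit all payloads, then collect results in the same key order,
        // mirroring the two foreach loops in ConvertToTrees above.
        var jobs = keys.Select(k => Task.Run(() => Expand(compressed[k]))).ToList();
        var uncompressed = new Dictionary<Node, List<string>>(ReferenceEqualityComparer.Instance);
        for (int i = 0; i < keys.Count; i++)
            uncompressed[keys[i]] = jobs[i].Result;

        foreach (var kv in uncompressed)
            Console.WriteLine($"{kv.Key.Label}: {string.Join(", ", kv.Value)}");
    }
}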
Example #8
        /// <summary>Segment input and write to output stream.</summary>
        /// <param name="segmenter"/>
        /// <param name="br"/>
        /// <param name="pwOut"/>
        /// <param name="nThreads"/>
        /// <returns>input characters processed per second</returns>
        private static double Decode(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads)
        {
            System.Diagnostics.Debug.Assert(nThreads > 0);
            long nChars    = 0;
            long startTime = Runtime.NanoTime();

            if (nThreads > 1)
            {
                MulticoreWrapper <string, string> wrapper = new MulticoreWrapper <string, string>(nThreads, segmenter);
                try
                {
                    for (string line; (line = br.ReadLine()) != null;)
                    {
                        nChars += line.Length;
                        wrapper.Put(line);
                        while (wrapper.Peek())
                        {
                            pwOut.Println(wrapper.Poll());
                        }
                    }
                    wrapper.Join();
                    while (wrapper.Peek())
                    {
                        pwOut.Println(wrapper.Poll());
                    }
                }
                catch (IOException e)
                {
                    log.Warn(e);
                }
            }
            else
            {
                nChars = segmenter.Segment(br, pwOut);
            }
            long   duration    = Runtime.NanoTime() - startTime;
            double charsPerSec = (double)nChars / (duration / 1000000000.0);

            return(charsPerSec);
        }
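Decode computes throughput as characters divided by elapsed seconds from Runtime.NanoTime. The idiomatic .NET equivalent of that timing is Stopwatch; a small sketch:

using System;
using System.Diagnostics;

static class ThroughputDemo
{
    static void Main()
    {
        var sw = Stopwatch.StartNew();
        long nChars = 0;
        for (int i = 0; i < 100_000; i++)
        {
            nChars += "some input line".Length; // stand-in for the real segmentation work
        }
        sw.Stop();
        double charsPerSec = nChars / sw.Elapsed.TotalSeconds;
        Console.WriteLine($"{charsPerSec:0} chars/sec");
    }
}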
Example #9
 // static main
 /// <param name="args">Command-line arguments: modelFile (runs as a filter from stdin to stdout)</param>
 public static void Main(string[] args)
 {
     if (args.Length != 1)
     {
         System.Console.Error.Printf("Usage: java %s model_file < input_file%n", typeof(Edu.Stanford.Nlp.Tagger.Maxent.Documentation.MulticoreWrapperDemo).FullName);
         System.Environment.Exit(-1);
     }
     try
     {
         // Load MaxentTagger, which is threadsafe
         string       modelFile = args[0];
         MaxentTagger tagger    = new MaxentTagger(modelFile);
         // Configure to run with 4 worker threads
         int nThreads = 4;
         MulticoreWrapper <string, string> wrapper = new MulticoreWrapper <string, string>(nThreads, new _IThreadsafeProcessor_42(tagger));
         // MaxentTagger is threadsafe
         // Submit jobs, which come from stdin
         BufferedReader br = new BufferedReader(new InputStreamReader(Runtime.@in));
         for (string line; (line = br.ReadLine()) != null;)
         {
             wrapper.Put(line);
             while (wrapper.Peek())
             {
                 System.Console.Out.WriteLine(wrapper.Poll());
             }
         }
         // Finished reading the input. Wait for jobs to finish
         wrapper.Join();
         while (wrapper.Peek())
         {
             System.Console.Out.WriteLine(wrapper.Poll());
         }
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
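This demo shows the canonical MulticoreWrapper lifecycle: Put each job, drain with Peek/Poll as results become ready, then Join and drain the remainder. Below is a reduced standalone analogue of that lifecycle built on Task (a sketch, not the Stanford API), which also preserves output order the way the real wrapper does:

using System;
using System.Collections.Generic;
using System.Threading.Tasks;

// A reduced analogue of the Put/Peek/Poll/Join pattern: jobs run on the
// thread pool and results come back in submission order.
class OrderedWrapper<TIn, TOut>
{
    private readonly Func<TIn, TOut> _processor;
    private readonly Queue<Task<TOut>> _pending = new Queue<Task<TOut>>();

    public OrderedWrapper(Func<TIn, TOut> processor) { _processor = processor; }

    public void Put(TIn item) => _pending.Enqueue(Task.Run(() => _processor(item)));

    // True when the oldest submitted job is done, i.e. Poll() will not block.
    public bool Peek() => _pending.Count > 0 && _pending.Peek().IsCompleted;

    public TOut Poll() => _pending.Dequeue().Result;

    // Wait for every outstanding job, after which Peek() drains the rest.
    public void Join() => Task.WaitAll(_pending.ToArray());
}

static class WrapperDemo
{
    static void Main()
    {
        var wrapper = new OrderedWrapper<string, string>(s => s.ToUpperInvariant());
        foreach (string line in new[] { "one", "two", "three" })
        {
            wrapper.Put(line);
            while (wrapper.Peek()) Console.WriteLine(wrapper.Poll());
        }
        wrapper.Join();
        while (wrapper.Peek()) Console.WriteLine(wrapper.Poll());
    }
}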
Example #10
        /// <summary>
        /// Samples the complete sequence once in the forward direction.
        /// Destructively modifies the sequence in place.
        /// </summary>
        /// <param name="sequence">the sequence to start with.</param>
        public virtual double SampleSequenceForward(ISequenceModel model, int[] sequence, double temperature, ICollection <int> onlySampleThesePositions)
        {
            double returnScore = double.NegativeInfinity;

            // log.info("Sampling forward");
            if (onlySampleThesePositions != null)
            {
                foreach (int pos in onlySampleThesePositions)
                {
                    returnScore = SamplePosition(model, sequence, pos, temperature);
                }
            }
            else
            {
                if (samplingStyle == SequentialSampling)
                {
                    for (int pos = 0; pos < sequence.Length; pos++)
                    {
                        returnScore = SamplePosition(model, sequence, pos, temperature);
                    }
                }
                else
                {
                    if (samplingStyle == RandomSampling)
                    {
                        foreach (int aSequence in sequence)
                        {
                            int pos = random.NextInt(sequence.Length);
                            returnScore = SamplePosition(model, sequence, pos, temperature);
                        }
                    }
                    else
                    {
                        if (samplingStyle == ChromaticSampling)
                        {
                            // make copies of the sequences and merge at the end
                            IList <Pair <int, int> > results = new List <Pair <int, int> >();
                            foreach (IList <int> indieList in partition)
                            {
                                if (indieList.Count <= chromaticSize)
                                {
                                    foreach (int pos in indieList)
                                    {
                                        Pair <int, double> newPosProb = SamplePositionHelper(model, sequence, pos, temperature);
                                        sequence[pos] = newPosProb.First();
                                    }
                                }
                                else
                                {
                                    MulticoreWrapper <IList <int>, IList <Pair <int, int> > > wrapper = new MulticoreWrapper <IList <int>, IList <Pair <int, int> > >(chromaticSize, new _IThreadsafeProcessor_269(this, model, sequence, temperature));
                                    // returns the position to sample in first place and new label in second place
                                    results.Clear();
                                    int interval = System.Math.Max(1, indieList.Count / chromaticSize);
                                    int indieListSize = indieList.Count;
                                    for (int begin = 0, end = 0; end < indieListSize; begin += interval)
                                    {
                                        end = System.Math.Min(begin + interval, indieListSize);
                                        wrapper.Put(indieList.SubList(begin, end));
                                        while (wrapper.Peek())
                                        {
                                            Sharpen.Collections.AddAll(results, wrapper.Poll());
                                        }
                                    }
                                    wrapper.Join();
                                    while (wrapper.Peek())
                                    {
                                        Sharpen.Collections.AddAll(results, wrapper.Poll());
                                    }
                                    foreach (Pair <int, int> posVal in results)
                                    {
                                        sequence[posVal.First()] = posVal.Second();
                                    }
                                }
                            }
                            returnScore = model.ScoreOf(sequence);
                        }
                    }
                }
            }
            return(returnScore);
        }
Example #11
        /// <summary>Test the parser on a treebank.</summary>
        /// <remarks>
        /// Test the parser on a treebank. Parses will be written to stdout, and
        /// various other information will be written to stderr and stdout,
        /// particularly if <code>op.testOptions.verbose</code> is true.
        /// </remarks>
        /// <param name="testTreebank">The treebank to parse</param>
        /// <returns>
        /// The labeled precision/recall F<sub>1</sub> (EVALB measure)
        /// of the parser on the treebank.
        /// </returns>
        public virtual double TestOnTreebank(Treebank testTreebank)
        {
            log.Info("Testing on treebank");
            Timing    treebankTotalTimer        = new Timing();
            TreePrint treePrint                 = op.testOptions.TreePrint(op.tlpParams);
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = op.Langpack();
            PrintWriter pwOut;
            PrintWriter pwErr;

            if (op.testOptions.quietEvaluation)
            {
                NullOutputStream quiet = new NullOutputStream();
                pwOut = tlpParams.Pw(quiet);
                pwErr = tlpParams.Pw(quiet);
            }
            else
            {
                pwOut = tlpParams.Pw();
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (op.testOptions.verbose)
            {
                pwErr.Print("Testing ");
                pwErr.Println(testTreebank.TextualSummary(tlp));
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(tlpParams);
            }
            PrintWriter pwFileOut = null;

            if (op.testOptions.writeOutputFiles)
            {
                string fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
                try
                {
                    pwFileOut = op.tlpParams.Pw(new FileOutputStream(fname));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            PrintWriter pwStats = null;

            if (op.testOptions.outputkBestEquivocation != null)
            {
                try
                {
                    pwStats = op.tlpParams.Pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            if (op.testOptions.testingThreads != 1)
            {
                MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                LinkedList <Tree> goldTrees = new LinkedList <Tree>();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <IHasWord> sentence = GetInputSentence(goldTree);
                    goldTrees.Add(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    wrapper.Put(sentence);
                    while (wrapper.Peek())
                    {
                        IParserQuery pq = wrapper.Poll();
                        // results return in submission order, so this gold tree matches pq
                        Tree dequeuedGold = goldTrees.Poll();
                        ProcessResults(pq, dequeuedGold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                    }
                }
                // for tree iterator
                wrapper.Join();
                while (wrapper.Peek())
                {
                    IParserQuery pq         = wrapper.Poll();
                    Tree         goldTree_1 = goldTrees.Poll();
                    ProcessResults(pq, goldTree_1, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            else
            {
                IParserQuery pq = pqFactory.ParserQuery();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <CoreLabel> sentence = GetInputSentence(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    pq.ParseAndReport(sentence, pwErr);
                    ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            // for tree iterator
            //Done parsing...print the results of the evaluations
            treebankTotalTimer.Done("Testing on treebank");
            if (op.testOptions.quietEvaluation)
            {
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (saidMemMessage)
            {
                ParserUtils.PrintOutOfMemory(pwErr);
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            if (numSkippedEvals != 0)
            {
                pwErr.Printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
            }
            // only created here so we know what parser types are supported...
            IParserQuery pq_1 = pqFactory.ParserQuery();

            if (summary)
            {
                if (pcfgLB != null)
                {
                    pcfgLB.Display(false, pwErr);
                }
                if (pcfgChildSpecific != null)
                {
                    pcfgChildSpecific.Display(false, pwErr);
                }
                if (pcfgLA != null)
                {
                    pcfgLA.Display(false, pwErr);
                }
                if (pcfgCB != null)
                {
                    pcfgCB.Display(false, pwErr);
                }
                if (pcfgDA != null)
                {
                    pcfgDA.Display(false, pwErr);
                }
                if (pcfgTA != null)
                {
                    pcfgTA.Display(false, pwErr);
                }
                if (pcfgLL != null && pq_1.GetPCFGParser() != null)
                {
                    pcfgLL.Display(false, pwErr);
                }
                if (depDA != null)
                {
                    depDA.Display(false, pwErr);
                }
                if (depTA != null)
                {
                    depTA.Display(false, pwErr);
                }
                if (depLL != null && pq_1.GetDependencyParser() != null)
                {
                    depLL.Display(false, pwErr);
                }
                if (factLB != null)
                {
                    factLB.Display(false, pwErr);
                }
                if (factChildSpecific != null)
                {
                    factChildSpecific.Display(false, pwErr);
                }
                if (factLA != null)
                {
                    factLA.Display(false, pwErr);
                }
                if (factCB != null)
                {
                    factCB.Display(false, pwErr);
                }
                if (factDA != null)
                {
                    factDA.Display(false, pwErr);
                }
                if (factTA != null)
                {
                    factTA.Display(false, pwErr);
                }
                if (factLL != null && pq_1.GetFactoredParser() != null)
                {
                    factLL.Display(false, pwErr);
                }
                if (pcfgCatE != null)
                {
                    pcfgCatE.Display(false, pwErr);
                }
                foreach (IEval eval in evals)
                {
                    eval.Display(false, pwErr);
                }
                foreach (BestOfTopKEval eval_1 in topKEvals)
                {
                    eval_1.Display(false, pwErr);
                }
            }
            // these ones only have a display mode, so display if turned on!!
            if (pcfgRUO != null)
            {
                pcfgRUO.Display(true, pwErr);
            }
            if (pcfgCUO != null)
            {
                pcfgCUO.Display(true, pwErr);
            }
            if (tsv)
            {
                NumberFormat nf = new DecimalFormat("0.00");
                pwErr.Println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && factDA != null)
                {
                    pwErr.Print(nf.Format(factDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetExactPercent()));
                }
                pwErr.Print("\t");
                if (pcfgLB != null)
                {
                    pwErr.Print(nf.Format(pcfgLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && depDA != null)
                {
                    pwErr.Print(nf.Format(depDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetPCFGParser() != null && factTA != null)
                {
                    pwErr.Print(nf.Format(factTA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(factLB.GetNum());
                }
                pwErr.Println();
            }
            double f1 = 0.0;

            if (factLB != null)
            {
                f1 = factLB.GetEvalbF1();
            }
            //Close files (if necessary)
            if (pwFileOut != null)
            {
                pwFileOut.Close();
            }
            if (pwStats != null)
            {
                pwStats.Close();
            }
            if (parserQueryEvals != null)
            {
                foreach (IParserQueryEval parserQueryEval in parserQueryEvals)
                {
                    parserQueryEval.Display(false, pwErr);
                }
            }
            return(f1);
        }
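A detail worth noting in TestOnTreebank: gold trees are enqueued as sentences are submitted, and because the wrapper returns results in submission order, polling the gold queue always yields the parse's matching tree. A minimal sketch of that FIFO pairing, assuming in-order results (Parse is a hypothetical stand-in):

using System;
using System.Collections.Generic;
using System.Threading.Tasks;

static class PairingDemo
{
    static string Parse(string sentence) => $"parse({sentence})";

    static void Main()
    {
        string[] gold = { "gold-A", "gold-B", "gold-C" };
        var goldQueue = new Queue<string>();
        var jobs = new Queue<Task<string>>();

        foreach (string g in gold)
        {
            goldQueue.Enqueue(g);                     // remember the gold tree
            string input = g.Replace("gold", "sent"); // stand-in for the input sentence
            jobs.Enqueue(Task.Run(() => Parse(input)));
        }
        // Results arrive in submission order, so each one pairs with the
        // oldest queued gold tree.
        while (jobs.Count > 0)
            Console.WriteLine($"{jobs.Dequeue().Result} <- {goldQueue.Dequeue()}");
    }
}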
Example #12
        // fill value & derivative
        protected internal override void Calculate(double[] theta)
        {
            dvModel.VectorToParams(theta);
            double localValue = 0.0;

            double[] localDerivative = new double[theta.Length];
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsB;

            binaryW_dfsG = TwoDimensionalMap.TreeMap();
            binaryW_dfsB = TwoDimensionalMap.TreeMap();
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesB;

            binaryScoreDerivativesG = TwoDimensionalMap.TreeMap();
            binaryScoreDerivativesB = TwoDimensionalMap.TreeMap();
            IDictionary <string, SimpleMatrix> unaryW_dfsG;
            IDictionary <string, SimpleMatrix> unaryW_dfsB;

            unaryW_dfsG = new SortedDictionary <string, SimpleMatrix>();
            unaryW_dfsB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesG;
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesB;

            unaryScoreDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            unaryScoreDerivativesB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesB = new SortedDictionary <string, SimpleMatrix>();

            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in dvModel.binaryTransform)
            {
                int numRows = entry.GetValue().NumRows();
                int numCols = entry.GetValue().NumCols();
                binaryW_dfsG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryW_dfsB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryScoreDerivativesG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
                binaryScoreDerivativesB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
            }
            foreach (KeyValuePair <string, SimpleMatrix> entry_1 in dvModel.unaryTransform)
            {
                int numRows = entry_1.Value.NumRows();
                int numCols = entry_1.Value.NumCols();
                unaryW_dfsG[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryW_dfsB[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryScoreDerivativesG[entry_1.Key] = new SimpleMatrix(1, numRows);
                unaryScoreDerivativesB[entry_1.Key] = new SimpleMatrix(1, numRows);
            }
            if (op.trainOptions.trainWordVectors)
            {
                foreach (KeyValuePair <string, SimpleMatrix> entry_2 in dvModel.wordVectors)
                {
                    int numRows = entry_2.Value.NumRows();
                    int numCols = entry_2.Value.NumCols();
                    wordVectorDerivativesG[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                    wordVectorDerivativesB[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                }
            }
            // Some optimization methods prints out a line without an end, so our
            // debugging statements are misaligned
            Timing scoreTiming = new Timing();

            scoreTiming.Doing("Scoring trees");
            int treeNum = 0;
            MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> > wrapper = new MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> >(op.trainOptions.trainingThreads, new DVParserCostAndGradient.ScoringProcessor(this));

            foreach (Tree tree in trainingBatch)
            {
                wrapper.Put(tree);
            }
            wrapper.Join();
            scoreTiming.Done();
            while (wrapper.Peek())
            {
                Pair <DeepTree, DeepTree> result = wrapper.Poll();
                DeepTree      goldTree           = result.first;
                DeepTree      bestTree           = result.second;
                StringBuilder treeDebugLine      = new StringBuilder();
                Formatter     formatter          = new Formatter(treeDebugLine);
                bool          isDone             = (Math.Abs(bestTree.GetScore() - goldTree.GetScore()) <= 0.00001 || goldTree.GetScore() > bestTree.GetScore());
                string        done = isDone ? "done" : string.Empty;
                formatter.Format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.GetScore(), goldTree.GetScore(), done);
                log.Info(treeDebugLine.ToString());
                if (!isDone)
                {
                    // if the gold tree is better than the best hypothesis tree by
                    // a large enough margin, then the score difference will be 0
                    // and we ignore the tree
                    double valueDelta = bestTree.GetScore() - goldTree.GetScore();
                    //double valueDelta = Math.max(0.0, - scoreGold + bestScore);
                    localValue += valueDelta;
                    // get the context words for this tree - should be the same
                    // for either goldTree or bestTree
                    IList <string> words = GetContextWords(goldTree.GetTree());
                    // The derivatives affected by this tree are only based on the
                    // nodes present in this tree, eg not all matrix derivatives
                    // will be affected by this tree
                    BackpropDerivative(goldTree.GetTree(), words, goldTree.GetVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG);
                    BackpropDerivative(bestTree.GetTree(), words, bestTree.GetVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB);
                }
                ++treeNum;
            }
            double[] localDerivativeGood;
            double[] localDerivativeB;
            if (op.trainOptions.trainWordVectors)
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator(), wordVectorDerivativesG.Values.GetEnumerator());
                localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator(), wordVectorDerivativesB.Values.GetEnumerator());
            }
            else
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator());
                localDerivativeB    = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator());
            }
            // correct - highest
            for (int i = 0; i < localDerivativeGood.Length; i++)
            {
                localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i];
            }
            // TODO: this is where we would combine multiple costs if we had parallelized the calculation
            value      = localValue;
            derivative = localDerivative;
            // normalizing by training batch size
            value = (1.0 / trainingBatch.Count) * value;
            ArrayMath.MultiplyInPlace(derivative, (1.0 / trainingBatch.Count));
            // add regularization to cost:
            double[] currentParams = dvModel.ParamsToVector();
            double   regCost       = 0;

            foreach (double currentParam in currentParams)
            {
                regCost += currentParam * currentParam;
            }
            regCost = op.trainOptions.regCost * 0.5 * regCost;
            value  += regCost;
            // add regularization to gradient
            ArrayMath.MultiplyInPlace(currentParams, op.trainOptions.regCost);
            ArrayMath.PairwiseAddInPlace(derivative, currentParams);
        }
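The tail of this method applies L2 regularization: the cost gains regCost * 0.5 * ||theta||^2 and the gradient gains regCost * theta. The same arithmetic shown in isolation:

using System;

static class L2Demo
{
    // Add lambda/2 * ||theta||^2 to the cost and lambda * theta to the gradient,
    // matching the regCost block above.
    static double AddL2(double[] theta, double[] gradient, double lambda, double cost)
    {
        double sumSq = 0;
        foreach (double t in theta) sumSq += t * t;
        for (int i = 0; i < gradient.Length; i++) gradient[i] += lambda * theta[i];
        return cost + 0.5 * lambda * sumSq;
    }

    static void Main()
    {
        double[] theta = { 1.0, -2.0 };
        double[] grad  = { 0.1, 0.2 };
        double cost = AddL2(theta, grad, lambda: 0.01, cost: 3.0);
        Console.WriteLine($"cost={cost}, grad=[{grad[0]}, {grad[1]}]"); // cost=3.025
    }
}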
Example #13
        /// <summary>Calculates both value and partial derivatives at the point x, and saves them internally.</summary>
        protected internal override void Calculate(double[] x)
        {
            double prob = 0.0;

            // the log prob of the sequence given the model, which is the negation of value at this point
            // final double[][] weights = to2D(x);
            To2D(x, weights);
            SetWeights(weights);
            // the expectations over counts
            // first index is feature index, second index is of possible labeling
            // double[][] E = empty2D();
            Clear2D(E);
            Clear2D(dropoutPriorGradTotal);
            MulticoreWrapper <Pair <int, bool>, Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > > wrapper = new MulticoreWrapper <Pair <int, bool>, Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > >(multiThreadGrad, dropoutPriorThreadProcessor);

            // supervised part
            for (int m = 0; m < totalData.Length; m++)
            {
                bool submitIsUnsup = (m >= unsupDropoutStartIndex);
                wrapper.Put(new Pair <int, bool>(m, submitIsUnsup));
                while (wrapper.Peek())
                {
                    Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > result = wrapper.Poll();
                    int  docIndex = result.First();
                    bool isUnsup  = docIndex >= unsupDropoutStartIndex;
                    if (isUnsup)
                    {
                        prob += unsupDropoutScale * result.Second();
                    }
                    else
                    {
                        prob += result.Second();
                    }
                    IDictionary <int, double[]> partialDropout = result.Fourth();
                    if (partialDropout != null)
                    {
                        if (isUnsup)
                        {
                            Combine2DArr(dropoutPriorGradTotal, partialDropout, unsupDropoutScale);
                        }
                        else
                        {
                            Combine2DArr(dropoutPriorGradTotal, partialDropout);
                        }
                    }
                    if (!isUnsup)
                    {
                        IDictionary <int, double[]> partialE = result.Third();
                        if (partialE != null)
                        {
                            Combine2DArr(E, partialE);
                        }
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > result = wrapper.Poll();
                int  docIndex = result.First();
                bool isUnsup  = docIndex >= unsupDropoutStartIndex;
                if (isUnsup)
                {
                    prob += unsupDropoutScale * result.Second();
                }
                else
                {
                    prob += result.Second();
                }
                IDictionary <int, double[]> partialDropout = result.Fourth();
                if (partialDropout != null)
                {
                    if (isUnsup)
                    {
                        Combine2DArr(dropoutPriorGradTotal, partialDropout, unsupDropoutScale);
                    }
                    else
                    {
                        Combine2DArr(dropoutPriorGradTotal, partialDropout);
                    }
                }
                if (!isUnsup)
                {
                    IDictionary <int, double[]> partialE = result.Third();
                    if (partialE != null)
                    {
                        Combine2DArr(E, partialE);
                    }
                }
            }
            if (double.IsNaN(prob))
            {
                // shouldn't be the case
                throw new Exception("Got NaN for prob in CRFLogConditionalObjectiveFunctionWithDropout.calculate()" + " - this may well indicate numeric underflow due to overly long documents.");
            }
            // because we minimize -L(\theta)
            value = -prob;
            if (Verbose)
            {
                log.Info("value is " + System.Math.Exp(-value));
            }
            // compute the partial derivative for each feature by comparing expected counts to empirical counts
            int index = 0;

            for (int i = 0; i < E.Length; i++)
            {
                for (int j = 0; j < E[i].Length; j++)
                {
                    // because we minimize -L(\theta)
                    derivative[index]  = (E[i][j] - Ehat[i][j]);
                    derivative[index] += dropoutScale * dropoutPriorGradTotal[i][j];
                    if (Verbose)
                    {
                        log.Info("deriv(" + i + ',' + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index]);
                    }
                    index++;
                }
            }
        }
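The final loop implements the standard CRF gradient identity: minimizing the negated log-likelihood, each feature's partial derivative is its expected count under the model minus its empirical count (plus the dropout-prior term here). A toy version of just that subtraction:

using System;

static class CrfGradientDemo
{
    static void Main()
    {
        // Toy expected (model) and empirical (data) feature counts.
        double[,] E    = { { 0.7, 0.3 }, { 0.4, 0.6 } };
        double[,] Ehat = { { 1.0, 0.0 }, { 0.0, 1.0 } };

        int index = 0;
        double[] derivative = new double[4];
        for (int i = 0; i < E.GetLength(0); i++)
        {
            for (int j = 0; j < E.GetLength(1); j++)
            {
                // d(-L)/d(theta_ij) = E[i,j] - Ehat[i,j]
                derivative[index++] = E[i, j] - Ehat[i, j];
            }
        }
        Console.WriteLine(string.Join(", ", derivative)); // -0.3, 0.3, 0.4, -0.4
    }
}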
Example #14
        /// <summary>Test on a file containing correct tags already.</summary>
        /// <remarks>
        /// Test on a file containing correct tags already, when initializing from trees.
        /// TODO: Add the ability to have a second transformer to transform output back; possibly combine this method
        /// with method below
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        private void Test()
        {
            numSentences    = 0;
            confusionMatrix = new ConfusionMatrix <string>();
            PrintFile pf  = null;
            PrintFile pf1 = null;
            PrintFile pf3 = null;

            if (writeWords)
            {
                pf = new PrintFile(saveRoot + ".words");
            }
            if (writeUnknDict)
            {
                pf1 = new PrintFile(saveRoot + ".un.dict");
            }
            if (writeTopWords)
            {
                pf3 = new PrintFile(saveRoot + ".words.top");
            }
            bool verboseResults = config.GetVerboseResults();

            if (config.GetNThreads() != 1)
            {
                MulticoreWrapper <IList <TaggedWord>, TestSentence> wrapper = new MulticoreWrapper <IList <TaggedWord>, TestSentence>(config.GetNThreads(), new TestClassifier.TestSentenceProcessor(maxentTagger));
                foreach (IList <TaggedWord> taggedSentence in fileRecord.Reader())
                {
                    wrapper.Put(taggedSentence);
                    while (wrapper.Peek())
                    {
                        ProcessResults(wrapper.Poll(), pf, pf1, pf3, verboseResults);
                    }
                }
                wrapper.Join();
                while (wrapper.Peek())
                {
                    ProcessResults(wrapper.Poll(), pf, pf1, pf3, verboseResults);
                }
            }
            else
            {
                foreach (IList <TaggedWord> taggedSentence in fileRecord.Reader())
                {
                    TestSentence testS = new TestSentence(maxentTagger);
                    testS.SetCorrectTags(taggedSentence);
                    testS.TagSentence(taggedSentence, false);
                    ProcessResults(testS, pf, pf1, pf3, verboseResults);
                }
            }
            if (pf != null)
            {
                pf.Close();
            }
            if (pf1 != null)
            {
                pf1.Close();
            }
            if (pf3 != null)
            {
                pf3.Close();
            }
        }
Example #15
        protected internal virtual double MultiThreadGradient(IList <int> docIDs, bool calculateEmpirical)
        {
            double objective = 0.0;

            // TODO: This is a bunch of unnecessary heap traffic, should all be on the stack
            if (multiThreadGrad > 1)
            {
                if (parallelE == null)
                {
                    parallelE = new double[multiThreadGrad][][];
                    for (int i = 0; i < multiThreadGrad; i++)
                    {
                        parallelE[i] = Empty2D();
                    }
                }
                if (calculateEmpirical)
                {
                    if (parallelEhat == null)
                    {
                        parallelEhat = new double[multiThreadGrad][][];
                        for (int i = 0; i < multiThreadGrad; i++)
                        {
                            parallelEhat[i] = Empty2D();
                        }
                    }
                }
            }
            // TODO: this is a huge amount of machinery for no discernible reason
            MulticoreWrapper <Pair <int, IList <int> >, Pair <int, double> > wrapper = new MulticoreWrapper <Pair <int, IList <int> >, Pair <int, double> >(multiThreadGrad, (calculateEmpirical ? expectedAndEmpiricalThreadProcessor : expectedThreadProcessor));
            int totalLen  = docIDs.Count;
            int partLen   = totalLen / multiThreadGrad;
            int currIndex = 0;

            for (int part = 0; part < multiThreadGrad; part++)
            {
                int endIndex = currIndex + partLen;
                if (part == multiThreadGrad - 1)
                {
                    endIndex = totalLen;
                }
                // TODO: let's not construct a sub-list of DocIDs, unnecessary object creation, can calculate directly from ThreadID
                IList <int> subList = docIDs.SubList(currIndex, endIndex);
                wrapper.Put(new Pair <int, IList <int> >(part, subList));
                currIndex = endIndex;
            }
            wrapper.Join();
            // This all seems fine. May want to start running this after the joins, in case we have different end-times
            while (wrapper.Peek())
            {
                Pair <int, double> result = wrapper.Poll();
                int tID = result.First();
                objective += result.Second();
                if (multiThreadGrad > 1)
                {
                    Combine2DArr(E, parallelE[tID]);
                    if (calculateEmpirical)
                    {
                        Combine2DArr(Ehat, parallelEhat[tID]);
                    }
                }
            }
            return(objective);
        }
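MultiThreadGradient slices docIDs into multiThreadGrad contiguous parts, letting the last part absorb the remainder, then sums the per-thread accumulators after the join. The slicing bookkeeping in isolation:

using System;
using System.Collections.Generic;

static class PartitionDemo
{
    static void Main()
    {
        var docIDs = new List<int> { 10, 11, 12, 13, 14, 15, 16 };
        int threads = 3;
        int partLen = docIDs.Count / threads;

        int currIndex = 0;
        for (int part = 0; part < threads; part++)
        {
            // The last slice runs to the end so no document is dropped when
            // the count is not divisible by the thread count.
            int endIndex = (part == threads - 1) ? docIDs.Count : currIndex + partLen;
            var slice = docIDs.GetRange(currIndex, endIndex - currIndex);
            Console.WriteLine($"thread {part}: [{string.Join(", ", slice)}]");
            currIndex = endIndex;
        }
    }
}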
Example #16
 public virtual void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter)
 where _T0 : IHasWord
     {
      DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
      if (op.testOptions.verbose)
     {
         if (tokenizerFactory != null)
         {
             pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
         }
     }
      Timing timer = new Timing();
      // timer.start(); // constructor already starts it.
      //Loop over the files
      for (int i = argIndex; i < args.Length; i++)
     {
         string filename = args[i];
         DocumentPreprocessor documentPreprocessor;
         if (filename.Equals("-"))
         {
             try
             {
                 documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
             }
             catch (IOException e)
             {
                 throw new RuntimeIOException(e);
             }
         }
         else
         {
             documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
         }
         //Unused values are null per the main() method invocation below
         //null is the default for these properties
         documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
         documentPreprocessor.SetEscaper(escaper);
         documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
         documentPreprocessor.SetTagDelimiter(tagDelimiter);
         documentPreprocessor.SetElementDelimiter(elementDelimiter);
         if (tokenizerFactory == null)
         {
             documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
         }
         else
         {
             documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
         }
         //Setup the output
         PrintWriter pwo = pwOut;
         if (op.testOptions.writeOutputFiles)
         {
             string normalizedName = filename;
             try
             {
                 new URL(normalizedName);
                 // this will exception if not a URL
                 normalizedName = normalizedName.ReplaceAll("/", "_");
             }
             catch (MalformedURLException)
             {
             }
             //It isn't a URL, so silently ignore
             string ext   = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
             string fname = normalizedName + '.' + ext;
             if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
             {
                 string fseparator = Runtime.GetProperty("file.separator");
                 if (fseparator == null || fseparator.IsEmpty())
                 {
                     fseparator = "/";
                 }
                 File fnameFile = new File(fname);
                 fname          = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
             }
             try
             {
                 pwo = op.tlpParams.Pw(new FileOutputStream(fname));
             }
             catch (IOException ioe)
             {
                 throw new RuntimeIOException(ioe);
             }
         }
         treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
         pwErr.Println("Parsing file: " + filename);
         int num          = 0;
         int numProcessed = 0;
         if (op.testOptions.testingThreads != 1)
         {
             MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 wrapper.Put(sentence);
                 while (wrapper.Peek())
                 {
                     IParserQuery pq = wrapper.Poll();
                     ProcessResults(pq, numProcessed++, pwo);
                 }
             }
             wrapper.Join();
             while (wrapper.Peek())
             {
                 IParserQuery pq = wrapper.Poll();
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         else
         {
             IParserQuery pq = pqFactory.ParserQuery();
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 pq.ParseAndReport(sentence, pwErr);
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         treePrint.PrintFooter(pwo);
         if (op.testOptions.writeOutputFiles)
         {
             pwo.Close();
         }
         pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
     }
      long millis = timer.Stop();
      if (summary)
     {
         if (pcfgLL != null)
         {
             pcfgLL.Display(false, pwErr);
         }
         if (depLL != null)
         {
             depLL.Display(false, pwErr);
         }
         if (factLL != null)
         {
             factLL.Display(false, pwErr);
         }
     }
      if (saidMemMessage)
     {
         ParserUtils.PrintOutOfMemory(pwErr);
     }
      double wordspersec = numWords / (((double)millis) / 1000);
      double sentspersec = numSents / (((double)millis) / 1000);
      NumberFormat nf    = new DecimalFormat("0.00");
      // easier way!
      pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
      if (numFallback > 0)
     {
         pwErr.Println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
     }
      if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
     {
         pwErr.Println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
         if (numUnparsable > 0)
         {
             pwErr.Println("    " + numUnparsable + " were not parsable with non-zero probability.");
         }
         if (numNoMemory > 0)
         {
             pwErr.Println("    " + numNoMemory + " were skipped because of insufficient memory.");
         }
         if (numSkipped > 0)
         {
             pwErr.Println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
         }
     }
     }
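
Both branches of the method above use the same MulticoreWrapper idiom: feed work in with Put, opportunistically drain finished results with Peek/Poll so completed parses do not pile up in memory while input is still streaming, then Join and drain whatever is left. Below is a minimal self-contained sketch of that idiom; MiniWrapper and the squaring processor are invented for illustration and only stand in for the real edu.stanford.nlp.util.concurrent.MulticoreWrapper, whose ordered-result mode this approximates.

    using System;
    using System.Collections.Generic;
    using System.Threading.Tasks;

    // Illustrative stand-in for MulticoreWrapper (not the real class):
    // Put schedules work on the thread pool, Peek reports whether the next
    // result in submission order is ready, Poll returns it, and Join blocks
    // until everything outstanding has finished.
    class MiniWrapper<TIn, TOut>
    {
        private readonly Func<TIn, TOut> processor;
        private readonly Queue<Task<TOut>> pending = new Queue<Task<TOut>>();

        public MiniWrapper(Func<TIn, TOut> processor) { this.processor = processor; }

        public void Put(TIn item) { pending.Enqueue(Task.Run(() => processor(item))); }

        public bool Peek() { return pending.Count > 0 && pending.Peek().IsCompleted; }

        public TOut Poll() { return pending.Dequeue().Result; }

        public void Join() { Task.WaitAll(pending.ToArray()); }
    }

    static class MiniWrapperDemo
    {
        static void Main()
        {
            MiniWrapper<int, int> wrapper = new MiniWrapper<int, int>(x => x * x);
            for (int i = 0; i < 100; i++)
            {
                wrapper.Put(i);
                // Drain whatever has already finished while still feeding,
                // exactly as the parsing loop above does per sentence.
                while (wrapper.Peek())
                {
                    Console.WriteLine(wrapper.Poll());
                }
            }
            wrapper.Join();          // wait for the stragglers...
            while (wrapper.Peek())   // ...then drain the final results
            {
                Console.WriteLine(wrapper.Poll());
            }
        }
    }
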
Example #17
0
        /// <summary>
        /// An example of a command line is
        /// <br />
        /// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel = null;
            string output      = null;
            IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
            int dvKBest    = 200;
            int numThreads = 1;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
                {
                    dvKBest   = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    output    = args[argIndex + 1];
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    treebanks.Add(treebankDescription);
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
                {
                    numThreads = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex  += 2;
                    continue;
                }
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
            if (parserModel == null)
            {
                throw new ArgumentException("Need to supply a parser model with -model");
            }
            if (output == null)
            {
                throw new ArgumentException("Need to supply an output filename with -output");
            }
            if (treebanks.IsEmpty())
            {
                throw new ArgumentException("Need to supply a treebank with -treebank");
            }
            log.Info("Writing output to " + output);
            log.Info("Loading parser model " + parserModel);
            log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
            LexicalizedParser    parser      = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", int.ToString(dvKBest)));
            CacheParseHypotheses cacher      = new CacheParseHypotheses(parser);
            ITreeTransformer     transformer = DVParser.BuildTrainTransformer(parser.GetOp());
            IList <Tree>         sentences   = new List <Tree>();

            foreach (Pair <string, IFileFilter> description in treebanks)
            {
                log.Info("Reading trees from " + description.first);
                Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(description.first, description.second);
                treebank = treebank.Transform(transformer);
                Sharpen.Collections.AddAll(sentences, treebank);
            }
            log.Info("Processing " + sentences.Count + " trees");
            IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();

            transformer = new SynchronizedTreeTransformer(transformer);
            MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

            foreach (Tree tree in sentences)
            {
                wrapper.Put(tree);
                while (wrapper.Peek())
                {
                    cache.Add(wrapper.Poll());
                    if (cache.Count % 10 == 0)
                    {
                        System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                    }
                }
            }
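            // every tree has been queued: wait for outstanding parses before the final drain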
            wrapper.Join();
            while (wrapper.Peek())
            {
                cache.Add(wrapper.Poll());
                if (cache.Count % 10 == 0)
                {
                    System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                }
            }
            System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
            IOUtils.WriteObjectToFile(cache, output);
        }
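
Each cache entry pairs a transformed gold tree with a compressed byte[] encoding of up to dvKBest hypothesis parses for it. As a hedged sketch of reading the file back, assuming the Sharpen port mirrors the Java originals with an IOUtils.ReadObjectFromFile generic and a static CacheParseHypotheses.ConvertToTrees decompressor (neither signature is verified here):

    // Hedged sketch: both helper calls are assumptions mirroring the Java API.
    IList<Pair<Tree, byte[]>> cache = IOUtils.ReadObjectFromFile<IList<Pair<Tree, byte[]>>>(new File("cached.train.simple.ser.gz"));
    foreach (Pair<Tree, byte[]> entry in cache)
    {
        Tree gold = entry.first;
        IList<Tree> hypotheses = CacheParseHypotheses.ConvertToTrees(entry.second);
        System.Console.Out.WriteLine("Cached " + hypotheses.Count + " hypotheses for a tree of size " + gold.Size());
    }
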
        /// <exception cref="System.Exception"/>
        public static void RunCoref(Properties props)
        {
            /*
             * property, environment setting
             */
            Redwood.HideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
            int    nThreads  = HybridCorefProperties.GetThreadCounts(props);
            string timeStamp = Calendar.GetInstance().GetTime().ToString().ReplaceAll("\\s", "-").ReplaceAll(":", "-");
            Logger logger    = Logger.GetLogger(typeof(Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem).FullName);

            // set log file path
            if (props.Contains(HybridCorefProperties.LogProp))
            {
                File logFile = new File(props.GetProperty(HybridCorefProperties.LogProp));
                RedwoodConfiguration.Current().Handlers(RedwoodConfiguration.Handlers.File(logFile)).Apply();
                Redwood.Log("Starting coref log");
            }
            log.Info(props.ToString());
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem cs = new Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem(props);

            /*
             * output setting
             */
            // prepare conll output
            string      goldOutput        = null;
            string      beforeCorefOutput = null;
            string      afterCorefOutput  = null;
            PrintWriter writerGold        = null;
            PrintWriter writerBeforeCoref = null;
            PrintWriter writerAfterCoref  = null;

            if (HybridCorefProperties.DoScore(props))
            {
                string pathOutput = CorefProperties.ConllOutputPath(props);
                (new File(pathOutput)).Mkdir();
                goldOutput        = pathOutput + "output-" + timeStamp + ".gold.txt";
                beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
                afterCorefOutput  = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
                writerGold        = new PrintWriter(new FileOutputStream(goldOutput));
                writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
                writerAfterCoref  = new PrintWriter(new FileOutputStream(afterCorefOutput));
            }
            // run coref
            MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]>(nThreads, new _IThreadsafeProcessor_134());
            // conll output and logs
            DateTime startTime = null;

            if (HybridCorefProperties.CheckTime(props))
            {
                startTime = new DateTime();
                System.Console.Error.Printf("END-TO-END COREF Start time: %s\n", startTime);
            }
            // run processes
            int docCnt = 0;

            while (true)
            {
                Document document = cs.docMaker.NextDoc();
                if (document == null)
                {
                    break;
                }
                wrapper.Put(Pair.MakePair(document, cs));
                docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            }
            // Finished reading the input. Wait for jobs to finish
            wrapper.Join();
            docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            IOUtils.CloseIgnoringExceptions(writerGold);
            IOUtils.CloseIgnoringExceptions(writerBeforeCoref);
            IOUtils.CloseIgnoringExceptions(writerAfterCoref);
            if (HybridCorefProperties.CheckTime(props))
            {
                System.Console.Error.Printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new DateTime()).GetTime() - startTime.GetTime()) / 1000F));
            }
            //      System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            // scoring
            if (HybridCorefProperties.DoScore(props))
            {
                string summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, beforeCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, false);
                summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, afterCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, true);
                CorefScorer.PrintFinalConllScore(summary);
            }
        }
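
LogOutput itself is not part of this excerpt. Given how it is called, once per submitted document and once more after Join, it presumably drains every finished document from the wrapper and appends its CoNLL renderings to the three writers. A hypothetical reconstruction follows; the slot layout of the StringBuilder[] and the progress logging are assumptions, and it presumes DoScore(props) was true so the writers are non-null.

    // Hypothetical drain helper, not the actual implementation.
    private static int LogOutput(MulticoreWrapper<Pair<Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]> wrapper,
                                 PrintWriter writerGold, PrintWriter writerBeforeCoref, PrintWriter writerAfterCoref, int docCnt)
    {
        while (wrapper.Peek())
        {
            StringBuilder[] output = wrapper.Poll();
            writerGold.Print(output[0]);         // assumed slot 0: gold CoNLL
            writerBeforeCoref.Print(output[1]);  // assumed slot 1: mentions before coref
            writerAfterCoref.Print(output[2]);   // assumed slot 2: predicted coref chains
            docCnt++;
            if (docCnt % 10 == 0)
            {
                log.Info(docCnt + " document(s) processed");
            }
        }
        return docCnt;
    }
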