public virtual void Annotate(Annotation annotation)
 {
     // turn the annotation into a sentence
     if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         if (nThreads == 1)
         {
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 DoOneSentence(sentence);
             }
         }
         else
         {
             MulticoreWrapper <ICoreMap, ICoreMap> wrapper = new MulticoreWrapper <ICoreMap, ICoreMap>(nThreads, new POSTaggerAnnotator.POSTaggerProcessor(this));
             foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
             {
                 wrapper.Put(sentence);
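                  // Results are drained but discarded here: POSTaggerProcessor
                  // writes the tags into each sentence's tokens in place.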
                 while (wrapper.Peek())
                 {
                     wrapper.Poll();
                 }
             }
             wrapper.Join();
             while (wrapper.Peek())
             {
                 wrapper.Poll();
             }
         }
     }
     else
     {
         throw new Exception("unable to find words/tokens in: " + annotation);
     }
 }
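
Every example on this page follows the same MulticoreWrapper idiom: Put() submits work, Peek()/Poll() drain finished results (in submission order by default; several examples below pass false to relax that), and Join() waits for the stragglers before a final drain. The sketch below is a minimal, hypothetical re-implementation of that idiom on plain TPL primitives; SimpleWrapper and its members are illustrative names, not the Stanford/Sharpen API.

 using System;
 using System.Collections.Generic;
 using System.Threading.Tasks;

 public sealed class SimpleWrapper<TIn, TOut>
 {
     private readonly Func<TIn, TOut> process;
     private readonly Queue<Task<TOut>> pending = new Queue<Task<TOut>>();

     public SimpleWrapper(Func<TIn, TOut> process) { this.process = process; }

     // Put: submit one item for asynchronous processing.
     public void Put(TIn item) => pending.Enqueue(Task.Run(() => process(item)));

     // Peek: true when the next result (in submission order) is ready.
     public bool Peek() => pending.Count > 0 && pending.Peek().IsCompleted;

     // Poll: remove and return the next result, blocking if necessary.
     public TOut Poll() => pending.Dequeue().Result;

     // Join: wait for all outstanding work; drain afterwards with Peek/Poll.
     public void Join() => Task.WaitAll(pending.ToArray());
 }

 public static class SimpleWrapperDemo
 {
     public static void Main()
     {
         var wrapper = new SimpleWrapper<string, string>(s => s.ToUpperInvariant());
         foreach (string line in new[] { "put", "peek", "poll" })
         {
             wrapper.Put(line);
             while (wrapper.Peek())
             {
                 Console.WriteLine(wrapper.Poll());
             }
         }
         wrapper.Join();
         while (wrapper.Peek())
         {
             Console.WriteLine(wrapper.Poll());
         }
     }
 }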
Example 2
 /// <summary>
 /// Instantiate a classifier with training data and randomly
 /// initialized parameter matrices in order to begin training.
 /// </summary>
 /// <param name="config"/>
 /// <param name="dataset"/>
 /// <param name="E"/>
 /// <param name="W1"/>
 /// <param name="b1"/>
 /// <param name="W2"/>
 /// <param name="preComputed"/>
 public Classifier(Config config, Dataset dataset, double[][] E, double[][] W1, double[] b1, double[][] W2, IList <int> preComputed)
 {
     // E: numFeatures x embeddingSize
     // W1: hiddenSize x (embeddingSize x numFeatures)
     // b1: hiddenSize
     // W2: numLabels x hiddenSize
     // Weight matrices
     // Global gradSaved
     // Gradient histories
     this.config  = config;
     this.dataset = dataset;
     this.E       = E;
     this.W1      = W1;
     this.b1      = b1;
     this.W2      = W2;
     InitGradientHistories();
     numLabels = W2.Length;
     preMap    = new Dictionary <int, int>();
     for (int i = 0; i < preComputed.Count && i < config.numPreComputed; ++i)
     {
         preMap[preComputed[i]] = i;
     }
     isTraining = dataset != null;
     if (isTraining)
     {
         jobHandler = new MulticoreWrapper <Pair <ICollection <Example>, Classifier.FeedforwardParams>, Classifier.Cost>(config.trainingThreads, new Classifier.CostFunction(this), false);
     }
     else
     {
         jobHandler = null;
     }
 }
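
As a hedged aside, the shapes documented in the comments above can be checked explicitly. This is a hypothetical helper, not part of the Stanford classifier; the parameter names are assumptions taken directly from the dimension comments.

 using System;

 public static class ShapeCheck
 {
     // E: numFeatures x embeddingSize, W1: hiddenSize x (embeddingSize x numFeatures),
     // b1: hiddenSize, W2: numLabels x hiddenSize
     public static void Validate(double[][] E, double[][] W1, double[] b1, double[][] W2,
                                 int numFeatures, int embeddingSize, int hiddenSize)
     {
         if (E.Length != numFeatures || E[0].Length != embeddingSize)
             throw new ArgumentException("E must be numFeatures x embeddingSize");
         if (W1.Length != hiddenSize || W1[0].Length != embeddingSize * numFeatures)
             throw new ArgumentException("W1 must be hiddenSize x (embeddingSize * numFeatures)");
         if (b1.Length != hiddenSize)
             throw new ArgumentException("b1 must have length hiddenSize");
         if (W2[0].Length != hiddenSize)
             throw new ArgumentException("W2 must be numLabels x hiddenSize");
     }
 }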
        /// <summary>
        /// Fix tree structure, phrasal categories and part-of-speech labels in newly expanded
        /// multi-word tokens.
        /// </summary>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        private IList <Tree> FixMultiWordTokens(IList <Tree> trees)
        {
            bool ner = PropertiesUtils.GetBool(options, "ner", false);
            // Shared resources
            IFactory <TreeNormalizer> tnf = new _IFactory_389();
            ITreeFactory tf = new LabeledScoredTreeFactory();
            IThreadsafeProcessor <ICollection <Tree>, ICollection <Tree> > processor = new AnCoraProcessor.MultiWordProcessor(this, tnf, tf, ner);
            int availableProcessors = Runtime.GetRuntime().AvailableProcessors();
            MulticoreWrapper <ICollection <Tree>, ICollection <Tree> > wrapper = new MulticoreWrapper <ICollection <Tree>, ICollection <Tree> >(availableProcessors, processor, false);
            // Chunk our work so that parallelization is actually worth it
            int numChunks = availableProcessors * 20;
            IList <IList <Tree> > chunked = CollectionUtils.PartitionIntoFolds(trees, numChunks);
            IList <Tree>          ret     = new List <Tree>();

            foreach (ICollection <Tree> coll in chunked)
            {
                wrapper.Put(coll);
                while (wrapper.Peek())
                {
                    Sharpen.Collections.AddAll(ret, wrapper.Poll());
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                Sharpen.Collections.AddAll(ret, wrapper.Poll());
            }
            return(ret);
        }
Example 4
 public static void RedoTags(IList <Tree> trees, Edu.Stanford.Nlp.Tagger.Common.Tagger tagger, int nThreads)
 {
     if (nThreads == 1)
     {
         foreach (Tree tree in trees)
         {
             RedoTags(tree, tagger);
         }
     }
     else
     {
         MulticoreWrapper <Tree, Tree> wrapper = new MulticoreWrapper <Tree, Tree>(nThreads, new ShiftReduceParser.RetagProcessor(tagger));
         foreach (Tree tree in trees)
         {
             wrapper.Put(tree);
         }
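          // No Poll() here: RetagProcessor retags each Tree in place, so
          // waiting on Join() is sufficient and the results are discarded.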
         wrapper.Join();
     }
 }
Example 5
        public static IdentityHashMap <Tree, IList <Tree> > ConvertToTrees(ICollection <Tree> keys, IdentityHashMap <Tree, byte[]> compressed, int numThreads)
        {
            IdentityHashMap <Tree, IList <Tree> >    uncompressed = Generics.NewIdentityHashMap();
            MulticoreWrapper <byte[], IList <Tree> > wrapper      = new MulticoreWrapper <byte[], IList <Tree> >(numThreads, new CacheParseHypotheses.DecompressionProcessor());

            foreach (Tree tree in keys)
            {
                wrapper.Put(compressed[tree]);
            }
            foreach (Tree tree_1 in keys)
            {
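                 // All jobs were submitted above; if the next result is not
                 // ready yet, Join() blocks until everything finishes, and
                 // Poll() then returns results in the original submission order.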
                if (!wrapper.Peek())
                {
                    wrapper.Join();
                }
                uncompressed[tree_1] = wrapper.Poll();
            }
            return(uncompressed);
        }
 /// <summary>Write the output of the coref system in CoNLL format, and log it.</summary>
 private static int LogOutput(MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]> wrapper, PrintWriter writerGold, PrintWriter writerBeforeCoref, PrintWriter writerAfterCoref, int docCnt)
 {
     while (wrapper.Peek())
     {
         StringBuilder[] output = wrapper.Poll();
         writerGold.Print(output[0]);
         writerBeforeCoref.Print(output[1]);
         writerAfterCoref.Print(output[2]);
         if (output[3].Length > 0)
         {
             log.Info(output[3]);
         }
         if ((++docCnt) % 10 == 0)
         {
             log.Info(docCnt + " document(s) processed");
         }
     }
     return(docCnt);
 }
Example 7
        /// <summary>Segment input and write to output stream.</summary>
        /// <param name="segmenter"/>
        /// <param name="br"/>
        /// <param name="pwOut"/>
        /// <param name="nThreads"/>
        /// <returns>input characters processed per second</returns>
        private static double Decode(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads)
        {
            System.Diagnostics.Debug.Assert(nThreads > 0);
            long nChars    = 0;
            long startTime = Runtime.NanoTime();

            if (nThreads > 1)
            {
                MulticoreWrapper <string, string> wrapper = new MulticoreWrapper <string, string>(nThreads, segmenter);
                try
                {
                    for (string line; (line = br.ReadLine()) != null;)
                    {
                        nChars += line.Length;
                        wrapper.Put(line);
                        while (wrapper.Peek())
                        {
                            pwOut.Println(wrapper.Poll());
                        }
                    }
                    wrapper.Join();
                    while (wrapper.Peek())
                    {
                        pwOut.Println(wrapper.Poll());
                    }
                }
                catch (IOException e)
                {
                    log.Warn(e);
                }
            }
            else
            {
                nChars = segmenter.Segment(br, pwOut);
            }
            long   duration    = Runtime.NanoTime() - startTime;
            double charsPerSec = (double)nChars / (duration / 1000000000.0);

            return(charsPerSec);
        }
Example 8
 // static main
 /// <param name="args">Command-line arguments: modelFile (runs as a filter from stdin to stdout)</param>
 public static void Main(string[] args)
 {
     if (args.Length != 1)
     {
         System.Console.Error.Printf("Usage: java %s model_file < input_file%n", typeof(Edu.Stanford.Nlp.Tagger.Maxent.Documentation.MulticoreWrapperDemo).FullName);
         System.Environment.Exit(-1);
     }
     try
     {
         // Load MaxentTagger, which is threadsafe
         string       modelFile = args[0];
         MaxentTagger tagger    = new MaxentTagger(modelFile);
         // Configure to run with 4 worker threads
         int nThreads = 4;
         MulticoreWrapper <string, string> wrapper = new MulticoreWrapper <string, string>(nThreads, new _IThreadsafeProcessor_42(tagger));
         // MaxentTagger is threadsafe
         // Submit jobs, which come from stdin
         BufferedReader br = new BufferedReader(new InputStreamReader(Runtime.@in));
         for (string line; (line = br.ReadLine()) != null;)
         {
             wrapper.Put(line);
             while (wrapper.Peek())
             {
                 System.Console.Out.WriteLine(wrapper.Poll());
             }
         }
         // Finished reading the input. Wait for jobs to finish
         wrapper.Join();
         while (wrapper.Peek())
         {
             System.Console.Out.WriteLine(wrapper.Poll());
         }
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Example 9
         /// <summary>
         /// Samples the complete sequence once in the forward direction.
         /// Destructively modifies the sequence in place.
         /// </summary>
        /// <param name="sequence">the sequence to start with.</param>
        public virtual double SampleSequenceForward(ISequenceModel model, int[] sequence, double temperature, ICollection <int> onlySampleThesePositions)
        {
            double returnScore = double.NegativeInfinity;

            // log.info("Sampling forward");
            if (onlySampleThesePositions != null)
            {
                foreach (int pos in onlySampleThesePositions)
                {
                    returnScore = SamplePosition(model, sequence, pos, temperature);
                }
            }
            else
            {
                if (samplingStyle == SequentialSampling)
                {
                    for (int pos = 0; pos < sequence.Length; pos++)
                    {
                        returnScore = SamplePosition(model, sequence, pos, temperature);
                    }
                }
                else
                {
                    if (samplingStyle == RandomSampling)
                    {
                        foreach (int aSequence in sequence)
                        {
                            int pos = random.NextInt(sequence.Length);
                            returnScore = SamplePosition(model, sequence, pos, temperature);
                        }
                    }
                    else
                    {
                        if (samplingStyle == ChromaticSampling)
                        {
                            // make copies of the sequences and merge at the end
                            IList <Pair <int, int> > results = new List <Pair <int, int> >();
                            foreach (IList <int> indieList in partition)
                            {
                                if (indieList.Count <= chromaticSize)
                                {
                                    foreach (int pos in indieList)
                                    {
                                        Pair <int, double> newPosProb = SamplePositionHelper(model, sequence, pos, temperature);
                                        sequence[pos] = newPosProb.First();
                                    }
                                }
                                else
                                {
                                    MulticoreWrapper <IList <int>, IList <Pair <int, int> > > wrapper = new MulticoreWrapper <IList <int>, IList <Pair <int, int> > >(chromaticSize, new _IThreadsafeProcessor_269(this, model, sequence, temperature));
                                    // returns the position to sample in first place and new label in second place
                                    results.Clear();
                                    int interval      = System.Math.Max(1, indieList.Count / chromaticSize);
                                    int indieListSize = indieList.Count;
                                    for (int begin = 0, end = 0; end < indieListSize; begin += interval)
                                    {
                                        end = System.Math.Min(begin + interval, indieListSize);
                                        wrapper.Put(indieList.SubList(begin, end));
                                        while (wrapper.Peek())
                                        {
                                            Sharpen.Collections.AddAll(results, wrapper.Poll());
                                        }
                                    }
                                    wrapper.Join();
                                    while (wrapper.Peek())
                                    {
                                        Sharpen.Collections.AddAll(results, wrapper.Poll());
                                    }
                                    foreach (Pair <int, int> posVal in results)
                                    {
                                        sequence[posVal.First()] = posVal.Second();
                                    }
                                }
                            }
                            returnScore = model.ScoreOf(sequence);
                        }
                    }
                }
            }
            return(returnScore);
        }
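For concreteness, here is the arithmetic the interval loop above performs, run on hypothetical numbers (an independent set of 10 positions with chromaticSize = 4); the class and values are illustrative only.

 using System;

 public static class ChunkDemo
 {
     public static void Main()
     {
         int count = 10, threads = 4;
         int interval = System.Math.Max(1, count / threads); // = 2
         for (int begin = 0, end = 0; end < count; begin += interval)
         {
             end = System.Math.Min(begin + interval, count);
             Console.WriteLine("[" + begin + ", " + end + ")");
         }
         // Prints [0, 2) [2, 4) [4, 6) [6, 8) [8, 10): five chunks are fed
         // to the wrapper, which schedules them across the worker threads.
     }
 }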
        protected internal override void Calculate(double[] theta)
        {
            model.VectorToParams(theta);
            SentimentCostAndGradient.ModelDerivatives derivatives;
            if (model.op.trainOptions.nThreads == 1)
            {
                derivatives = ScoreDerivatives(trainingBatch);
            }
            else
            {
                // TODO: because some addition operations happen in different
                // orders now, this results in slightly different values, which
                // over time add up to significantly different models even when
                // given the same random seed.  Probably not a big deal.
                // To be more specific, for trees T1, T2, T3, ... Tn,
                // when using one thread, we sum the derivatives T1 + T2 ...
                // When using multiple threads, we first sum T1 + ... + Tk,
                // then sum Tk+1 + ... + T2k, etc, for split size k.
                // The splits are then summed in order.
                // This different sum order results in slightly different numbers.
                MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives> wrapper = new MulticoreWrapper <IList <Tree>, SentimentCostAndGradient.ModelDerivatives>(model.op.trainOptions.nThreads, new SentimentCostAndGradient.ScoringProcessor(this));
                // use wrapper.nThreads in case the number of threads was automatically changed
                foreach (IList <Tree> chunk in CollectionUtils.PartitionIntoFolds(trainingBatch, wrapper.NThreads()))
                {
                    wrapper.Put(chunk);
                }
                wrapper.Join();
                derivatives = new SentimentCostAndGradient.ModelDerivatives(model);
                while (wrapper.Peek())
                {
                    SentimentCostAndGradient.ModelDerivatives batchDerivatives = wrapper.Poll();
                    derivatives.Add(batchDerivatives);
                }
            }
            // scale the error by the number of sentences so that the
            // regularization isn't drowned out for large training batches
            double scale = (1.0 / trainingBatch.Count);

            value      = derivatives.error * scale;
            value     += ScaleAndRegularize(derivatives.binaryTD, model.binaryTransform, scale, model.op.trainOptions.regTransformMatrix, false);
            value     += ScaleAndRegularize(derivatives.binaryCD, model.binaryClassification, scale, model.op.trainOptions.regClassification, true);
            value     += ScaleAndRegularizeTensor(derivatives.binaryTensorTD, model.binaryTensors, scale, model.op.trainOptions.regTransformTensor);
            value     += ScaleAndRegularize(derivatives.unaryCD, model.unaryClassification, scale, model.op.trainOptions.regClassification, false, true);
            value     += ScaleAndRegularize(derivatives.wordVectorD, model.wordVectors, scale, model.op.trainOptions.regWordVector, true, false);
            derivative = NeuralUtils.ParamsToVector(theta.Length, derivatives.binaryTD.ValueIterator(), derivatives.binaryCD.ValueIterator(), SimpleTensor.IteratorSimpleMatrix(derivatives.binaryTensorTD.ValueIterator()), derivatives.unaryCD.Values.GetEnumerator(), derivatives.wordVectorD.Values.GetEnumerator());
        }
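The TODO comment in the method above turns on a basic fact worth making concrete: floating-point addition is not associative, so summing per-thread partial derivatives in chunks can differ in the last bits from a single sequential sum. A minimal demonstration:

 using System;

 public static class SumOrderDemo
 {
     public static void Main()
     {
         double left  = (0.1 + 0.2) + 0.3; // 0.6000000000000001
         double right = 0.1 + (0.2 + 0.3); // 0.6
         Console.WriteLine(left == right); // False: same terms, different order
     }
 }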
        /// <summary>Test the parser on a treebank.</summary>
        /// <remarks>
        /// Test the parser on a treebank. Parses will be written to stdout, and
        /// various other information will be written to stderr and stdout,
        /// particularly if <code>op.testOptions.verbose</code> is true.
        /// </remarks>
        /// <param name="testTreebank">The treebank to parse</param>
        /// <returns>
        /// The labeled precision/recall F<sub>1</sub> (EVALB measure)
        /// of the parser on the treebank.
        /// </returns>
        public virtual double TestOnTreebank(Treebank testTreebank)
        {
            log.Info("Testing on treebank");
            Timing    treebankTotalTimer        = new Timing();
            TreePrint treePrint                 = op.testOptions.TreePrint(op.tlpParams);
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = op.Langpack();
            PrintWriter pwOut;
            PrintWriter pwErr;

            if (op.testOptions.quietEvaluation)
            {
                NullOutputStream quiet = new NullOutputStream();
                pwOut = tlpParams.Pw(quiet);
                pwErr = tlpParams.Pw(quiet);
            }
            else
            {
                pwOut = tlpParams.Pw();
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (op.testOptions.verbose)
            {
                pwErr.Print("Testing ");
                pwErr.Println(testTreebank.TextualSummary(tlp));
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(tlpParams);
            }
            PrintWriter pwFileOut = null;

            if (op.testOptions.writeOutputFiles)
            {
                string fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
                try
                {
                    pwFileOut = op.tlpParams.Pw(new FileOutputStream(fname));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            PrintWriter pwStats = null;

            if (op.testOptions.outputkBestEquivocation != null)
            {
                try
                {
                    pwStats = op.tlpParams.Pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            if (op.testOptions.testingThreads != 1)
            {
                MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                LinkedList <Tree> goldTrees = new LinkedList <Tree>();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <IHasWord> sentence = GetInputSentence(goldTree);
                    goldTrees.Add(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    wrapper.Put(sentence);
                    while (wrapper.Peek())
                    {
                        IParserQuery pq = wrapper.Poll();
                        // C# foreach variables are read-only, so use a new
                        // local instead of reassigning goldTree.
                        Tree gold = goldTrees.Poll();
                        ProcessResults(pq, gold, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                    }
                }
                // for tree iterator
                wrapper.Join();
                while (wrapper.Peek())
                {
                    IParserQuery pq         = wrapper.Poll();
                    Tree         goldTree_1 = goldTrees.Poll();
                    ProcessResults(pq, goldTree_1, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            else
            {
                IParserQuery pq = pqFactory.ParserQuery();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <CoreLabel> sentence = GetInputSentence(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    pq.ParseAndReport(sentence, pwErr);
                    ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            // for tree iterator
            //Done parsing...print the results of the evaluations
            treebankTotalTimer.Done("Testing on treebank");
            if (op.testOptions.quietEvaluation)
            {
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (saidMemMessage)
            {
                ParserUtils.PrintOutOfMemory(pwErr);
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            if (numSkippedEvals != 0)
            {
                pwErr.Printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
            }
            // only created here so we know what parser types are supported...
            IParserQuery pq_1 = pqFactory.ParserQuery();

            if (summary)
            {
                if (pcfgLB != null)
                {
                    pcfgLB.Display(false, pwErr);
                }
                if (pcfgChildSpecific != null)
                {
                    pcfgChildSpecific.Display(false, pwErr);
                }
                if (pcfgLA != null)
                {
                    pcfgLA.Display(false, pwErr);
                }
                if (pcfgCB != null)
                {
                    pcfgCB.Display(false, pwErr);
                }
                if (pcfgDA != null)
                {
                    pcfgDA.Display(false, pwErr);
                }
                if (pcfgTA != null)
                {
                    pcfgTA.Display(false, pwErr);
                }
                if (pcfgLL != null && pq_1.GetPCFGParser() != null)
                {
                    pcfgLL.Display(false, pwErr);
                }
                if (depDA != null)
                {
                    depDA.Display(false, pwErr);
                }
                if (depTA != null)
                {
                    depTA.Display(false, pwErr);
                }
                if (depLL != null && pq_1.GetDependencyParser() != null)
                {
                    depLL.Display(false, pwErr);
                }
                if (factLB != null)
                {
                    factLB.Display(false, pwErr);
                }
                if (factChildSpecific != null)
                {
                    factChildSpecific.Display(false, pwErr);
                }
                if (factLA != null)
                {
                    factLA.Display(false, pwErr);
                }
                if (factCB != null)
                {
                    factCB.Display(false, pwErr);
                }
                if (factDA != null)
                {
                    factDA.Display(false, pwErr);
                }
                if (factTA != null)
                {
                    factTA.Display(false, pwErr);
                }
                if (factLL != null && pq_1.GetFactoredParser() != null)
                {
                    factLL.Display(false, pwErr);
                }
                if (pcfgCatE != null)
                {
                    pcfgCatE.Display(false, pwErr);
                }
                foreach (IEval eval in evals)
                {
                    eval.Display(false, pwErr);
                }
                foreach (BestOfTopKEval eval_1 in topKEvals)
                {
                    eval_1.Display(false, pwErr);
                }
            }
            // these ones only have a display mode, so display if turned on!!
            if (pcfgRUO != null)
            {
                pcfgRUO.Display(true, pwErr);
            }
            if (pcfgCUO != null)
            {
                pcfgCUO.Display(true, pwErr);
            }
            if (tsv)
            {
                NumberFormat nf = new DecimalFormat("0.00");
                pwErr.Println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && factDA != null)
                {
                    pwErr.Print(nf.Format(factDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetExactPercent()));
                }
                pwErr.Print("\t");
                if (pcfgLB != null)
                {
                    pwErr.Print(nf.Format(pcfgLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && depDA != null)
                {
                    pwErr.Print(nf.Format(depDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetPCFGParser() != null && factTA != null)
                {
                    pwErr.Print(nf.Format(factTA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(factLB.GetNum());
                }
                pwErr.Println();
            }
            double f1 = 0.0;

            if (factLB != null)
            {
                f1 = factLB.GetEvalbF1();
            }
            //Close files (if necessary)
            if (pwFileOut != null)
            {
                pwFileOut.Close();
            }
            if (pwStats != null)
            {
                pwStats.Close();
            }
            if (parserQueryEvals != null)
            {
                foreach (IParserQueryEval parserQueryEval in parserQueryEvals)
                {
                    parserQueryEval.Display(false, pwErr);
                }
            }
            return(f1);
        }
        /// <summary>Calculates both value and partial derivatives at the point x, and save them internally.</summary>
        protected internal override void Calculate(double[] x)
        {
            double prob = 0.0;

            // the log prob of the sequence given the model, which is the negation of value at this point
            // final double[][] weights = to2D(x);
            To2D(x, weights);
            SetWeights(weights);
            // the expectations over counts
            // first index is feature index, second index is of possible labeling
            // double[][] E = empty2D();
            Clear2D(E);
            Clear2D(dropoutPriorGradTotal);
             MulticoreWrapper <Pair <int, bool>, Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > > wrapper = new MulticoreWrapper <Pair <int, bool>, Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > >(multiThreadGrad, dropoutPriorThreadProcessor);

            // supervised part
            for (int m = 0; m < totalData.Length; m++)
            {
                bool submitIsUnsup = (m >= unsupDropoutStartIndex);
                wrapper.Put(new Pair <int, bool>(m, submitIsUnsup));
                while (wrapper.Peek())
                {
                    Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > result = wrapper.Poll();
                    int  docIndex = result.First();
                    bool isUnsup  = docIndex >= unsupDropoutStartIndex;
                    if (isUnsup)
                    {
                        prob += unsupDropoutScale * result.Second();
                    }
                    else
                    {
                        prob += result.Second();
                    }
                    IDictionary <int, double[]> partialDropout = result.Fourth();
                    if (partialDropout != null)
                    {
                        if (isUnsup)
                        {
                            Combine2DArr(dropoutPriorGradTotal, partialDropout, unsupDropoutScale);
                        }
                        else
                        {
                            Combine2DArr(dropoutPriorGradTotal, partialDropout);
                        }
                    }
                    if (!isUnsup)
                    {
                        IDictionary <int, double[]> partialE = result.Third();
                        if (partialE != null)
                        {
                            Combine2DArr(E, partialE);
                        }
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > result = wrapper.Poll();
                int  docIndex = result.First();
                bool isUnsup  = docIndex >= unsupDropoutStartIndex;
                if (isUnsup)
                {
                    prob += unsupDropoutScale * result.Second();
                }
                else
                {
                    prob += result.Second();
                }
                IDictionary <int, double[]> partialDropout = result.Fourth();
                if (partialDropout != null)
                {
                    if (isUnsup)
                    {
                        Combine2DArr(dropoutPriorGradTotal, partialDropout, unsupDropoutScale);
                    }
                    else
                    {
                        Combine2DArr(dropoutPriorGradTotal, partialDropout);
                    }
                }
                if (!isUnsup)
                {
                    IDictionary <int, double[]> partialE = result.Third();
                    if (partialE != null)
                    {
                        Combine2DArr(E, partialE);
                    }
                }
            }
            if (double.IsNaN(prob))
            {
                // shouldn't be the case
                throw new Exception("Got NaN for prob in CRFLogConditionalObjectiveFunctionWithDropout.calculate()" + " - this may well indicate numeric underflow due to overly long documents.");
            }
            // because we minimize -L(\theta)
            value = -prob;
            if (Verbose)
            {
                log.Info("value is " + System.Math.Exp(-value));
            }
            // compute the partial derivative for each feature by comparing expected counts to empirical counts
            int index = 0;

            for (int i = 0; i < E.Length; i++)
            {
                for (int j = 0; j < E[i].Length; j++)
                {
                    // because we minimize -L(\theta)
                    derivative[index]  = (E[i][j] - Ehat[i][j]);
                    derivative[index] += dropoutScale * dropoutPriorGradTotal[i][j];
                    if (Verbose)
                    {
                        log.Info("deriv(" + i + ',' + j + ") = " + E[i][j] + " - " + Ehat[i][j] + " = " + derivative[index]);
                    }
                    index++;
                }
            }
        }
Example 13
         private void TrainModel(string serializedPath, Edu.Stanford.Nlp.Tagger.Common.Tagger tagger, Random random, IList <Tree> binarizedTrees, IList <IList <ITransition> > transitionLists, Treebank devTreebank, int nThreads, ICollection <string> allowedFeatures)
        {
            double bestScore     = 0.0;
            int    bestIteration = 0;
            PriorityQueue <ScoredObject <PerceptronModel> > bestModels = null;

            if (op.TrainOptions().averagedModels > 0)
            {
                bestModels = new PriorityQueue <ScoredObject <PerceptronModel> >(op.TrainOptions().averagedModels + 1, ScoredComparator.AscendingComparator);
            }
            IList <int> indices = Generics.NewArrayList();

            for (int i = 0; i < binarizedTrees.Count; ++i)
            {
                indices.Add(i);
            }
            Oracle oracle = null;

            if (op.TrainOptions().trainingMethod == ShiftReduceTrainOptions.TrainingMethod.Oracle)
            {
                oracle = new Oracle(binarizedTrees, op.compoundUnaries, rootStates);
            }
            IList <PerceptronModel.Update>           updates = Generics.NewArrayList();
            MulticoreWrapper <int, Pair <int, int> > wrapper = null;

            if (nThreads != 1)
            {
                updates = Java.Util.Collections.SynchronizedList(updates);
                wrapper = new MulticoreWrapper <int, Pair <int, int> >(op.trainOptions.trainingThreads, new PerceptronModel.TrainTreeProcessor(this, binarizedTrees, transitionLists, updates, oracle));
            }
            IntCounter <string> featureFrequencies = null;

            if (op.TrainOptions().featureFrequencyCutoff > 1)
            {
                featureFrequencies = new IntCounter <string>();
            }
            for (int iteration = 1; iteration <= op.trainOptions.trainingIterations; ++iteration)
            {
                Timing trainingTimer = new Timing();
                int    numCorrect    = 0;
                int    numWrong      = 0;
                Java.Util.Collections.Shuffle(indices, random);
                for (int start = 0; start < indices.Count; start += op.trainOptions.batchSize)
                {
                    int end = Math.Min(start + op.trainOptions.batchSize, indices.Count);
                    Triple <IList <PerceptronModel.Update>, int, int> result = TrainBatch(indices.SubList(start, end), binarizedTrees, transitionLists, updates, oracle, wrapper);
                    numCorrect += result.second;
                    numWrong   += result.third;
                    foreach (PerceptronModel.Update update in result.first)
                    {
                        foreach (string feature in update.features)
                        {
                            if (allowedFeatures != null && !allowedFeatures.Contains(feature))
                            {
                                continue;
                            }
                            Weight weights = featureWeights[feature];
                            if (weights == null)
                            {
                                weights = new Weight();
                                featureWeights[feature] = weights;
                            }
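                            // Standard structured-perceptron update: reward the
                            // gold transition and penalize the predicted one.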
                            weights.UpdateWeight(update.goldTransition, update.delta);
                            weights.UpdateWeight(update.predictedTransition, -update.delta);
                            if (featureFrequencies != null)
                            {
                                featureFrequencies.IncrementCount(feature, (update.goldTransition >= 0 && update.predictedTransition >= 0) ? 2 : 1);
                            }
                        }
                    }
                    updates.Clear();
                }
                trainingTimer.Done("Iteration " + iteration);
                log.Info("While training, got " + numCorrect + " transitions correct and " + numWrong + " transitions wrong");
                OutputStats();
                double labelF1 = 0.0;
                if (devTreebank != null)
                {
                    EvaluateTreebank evaluator = new EvaluateTreebank(op, null, new ShiftReduceParser(op, this), tagger);
                    evaluator.TestOnTreebank(devTreebank);
                    labelF1 = evaluator.GetLBScore();
                    log.Info("Label F1 after " + iteration + " iterations: " + labelF1);
                    if (labelF1 > bestScore)
                    {
                        log.Info("New best dev score (previous best " + bestScore + ")");
                        bestScore     = labelF1;
                        bestIteration = iteration;
                    }
                    else
                    {
                        log.Info("Failed to improve for " + (iteration - bestIteration) + " iteration(s) on previous best score of " + bestScore);
                        if (op.trainOptions.stalledIterationLimit > 0 && (iteration - bestIteration >= op.trainOptions.stalledIterationLimit))
                        {
                            log.Info("Failed to improve for too long, stopping training");
                            break;
                        }
                    }
                    log.Info();
                    if (bestModels != null)
                    {
                        bestModels.Add(new ScoredObject <PerceptronModel>(new PerceptronModel(this), labelF1));
                        if (bestModels.Count > op.TrainOptions().averagedModels)
                        {
                            bestModels.Poll();
                        }
                    }
                }
                if (op.TrainOptions().saveIntermediateModels && serializedPath != null && op.trainOptions.debugOutputFrequency > 0)
                {
                    string            tempName = Sharpen.Runtime.Substring(serializedPath, 0, serializedPath.Length - 7) + "-" + Filename.Format(iteration) + "-" + Nf.Format(labelF1) + ".ser.gz";
                    ShiftReduceParser temp     = new ShiftReduceParser(op, this);
                    temp.SaveModel(tempName);
                }
                // TODO: we could save a cutoff version of the model,
                // especially if we also get a dev set number for it, but that
                // might be overkill
                if (iteration % 10 == 0 && op.TrainOptions().decayLearningRate > 0.0)
                {
                    learningRate *= op.TrainOptions().decayLearningRate;
                }
            }
            // end for iterations
            if (wrapper != null)
            {
                wrapper.Join();
            }
            if (bestModels != null)
            {
                if (op.TrainOptions().cvAveragedModels && devTreebank != null)
                {
                    IList <ScoredObject <PerceptronModel> > models = Generics.NewArrayList();
                    while (bestModels.Count > 0)
                    {
                        models.Add(bestModels.Poll());
                    }
                    Java.Util.Collections.Reverse(models);
                    double bestF1   = 0.0;
                    int    bestSize = 0;
                    for (int i_1 = 1; i_1 <= models.Count; ++i_1)
                    {
                        log.Info("Testing with " + i_1 + " models averaged together");
                        // TODO: this is kind of ugly, would prefer a separate object
                        AverageScoredModels(models.SubList(0, i_1));
                        ShiftReduceParser temp      = new ShiftReduceParser(op, this);
                        EvaluateTreebank  evaluator = new EvaluateTreebank(temp.GetOp(), null, temp, tagger);
                        evaluator.TestOnTreebank(devTreebank);
                        double labelF1 = evaluator.GetLBScore();
                        log.Info("Label F1 for " + i_1 + " models: " + labelF1);
                        if (labelF1 > bestF1)
                        {
                            bestF1   = labelF1;
                            bestSize = i_1;
                        }
                    }
                    AverageScoredModels(models.SubList(0, bestSize));
                }
                else
                {
                    AverageScoredModels(bestModels);
                }
            }
            // TODO: perhaps we should filter the features and then get dev
            // set scores.  That way we can merge the models which are best
            // after filtering.
            if (featureFrequencies != null)
            {
                FilterFeatures(featureFrequencies.KeysAbove(op.TrainOptions().featureFrequencyCutoff));
            }
            CondenseFeatures();
        }
Example 14
        /// <summary>
        /// Trains a batch of trees and returns the following: a list of
        /// Update objects, the number of transitions correct, and the number
        /// of transitions wrong.
        /// </summary>
        /// <remarks>
        /// Trains a batch of trees and returns the following: a list of
        /// Update objects, the number of transitions correct, and the number
        /// of transitions wrong.
        /// If the model is trained with multiple threads, it is expected
        /// that a valid MulticoreWrapper is passed in which does the
        /// processing.  In that case, the processing is done on all of the
        /// trees without updating any weights, which allows the results for
        /// multithreaded training to be reproduced.
        /// </remarks>
         private Triple <IList <PerceptronModel.Update>, int, int> TrainBatch(IList <int> indices, IList <Tree> binarizedTrees, IList <IList <ITransition> > transitionLists, IList <PerceptronModel.Update> updates, Oracle oracle, MulticoreWrapper <int, Pair <int, int> > wrapper)
        {
            int numCorrect = 0;
            int numWrong   = 0;

            if (op.trainOptions.trainingThreads == 1)
            {
                foreach (int index in indices)
                {
                    Pair <int, int> count = TrainTree(index, binarizedTrees, transitionLists, updates, oracle);
                    numCorrect += count.first;
                    numWrong   += count.second;
                }
            }
            else
            {
                foreach (int index in indices)
                {
                    wrapper.Put(index);
                }
                wrapper.Join(false);
                while (wrapper.Peek())
                {
                    Pair <int, int> result = wrapper.Poll();
                    numCorrect += result.first;
                    numWrong   += result.second;
                }
            }
            return(new Triple <IList <PerceptronModel.Update>, int, int>(updates, numCorrect, numWrong));
        }
Example 15
        protected internal virtual double MultiThreadGradient(IList <int> docIDs, bool calculateEmpirical)
        {
            double objective = 0.0;

            // TODO: This is a bunch of unnecessary heap traffic, should all be on the stack
            if (multiThreadGrad > 1)
            {
                if (parallelE == null)
                {
                    parallelE = new double[multiThreadGrad][][];
                    for (int i = 0; i < multiThreadGrad; i++)
                    {
                        parallelE[i] = Empty2D();
                    }
                }
                if (calculateEmpirical)
                {
                    if (parallelEhat == null)
                    {
                        parallelEhat = new double[multiThreadGrad][][];
                        for (int i = 0; i < multiThreadGrad; i++)
                        {
                            parallelEhat[i] = Empty2D();
                        }
                    }
                }
            }
            // TODO: this is a huge amount of machinery for no discernible reason
            MulticoreWrapper <Pair <int, IList <int> >, Pair <int, double> > wrapper = new MulticoreWrapper <Pair <int, IList <int> >, Pair <int, double> >(multiThreadGrad, (calculateEmpirical ? expectedAndEmpiricalThreadProcessor : expectedThreadProcessor));
            int totalLen  = docIDs.Count;
            int partLen   = totalLen / multiThreadGrad;
            int currIndex = 0;
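            // Integer division drops the remainder; the final thread's chunk
            // is extended to totalLen below so no document is left out.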

            for (int part = 0; part < multiThreadGrad; part++)
            {
                int endIndex = currIndex + partLen;
                if (part == multiThreadGrad - 1)
                {
                    endIndex = totalLen;
                }
                // TODO: let's not construct a sub-list of DocIDs, unnecessary object creation, can calculate directly from ThreadID
                IList <int> subList = docIDs.SubList(currIndex, endIndex);
                wrapper.Put(new Pair <int, IList <int> >(part, subList));
                currIndex = endIndex;
            }
            wrapper.Join();
            // This all seems fine. May want to start running this after the joins, in case we have different end-times
            while (wrapper.Peek())
            {
                Pair <int, double> result = wrapper.Poll();
                int tID = result.First();
                objective += result.Second();
                if (multiThreadGrad > 1)
                {
                    Combine2DArr(E, parallelE[tID]);
                    if (calculateEmpirical)
                    {
                        Combine2DArr(Ehat, parallelEhat[tID]);
                    }
                }
            }
            return(objective);
        }
 public virtual void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter
                                      )
 where _T0 : IHasWord
     {
      DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
      if (op.testOptions.verbose)
     {
         if (tokenizerFactory != null)
         {
             pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
         }
     }
      Timing timer = new Timing();
      // timer.start(); // constructor already starts it.
      //Loop over the files
      for (int i = argIndex; i < args.Length; i++)
     {
         string filename = args[i];
         DocumentPreprocessor documentPreprocessor;
         if (filename.Equals("-"))
         {
             try
             {
                 documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
             }
             catch (IOException e)
             {
                 throw new RuntimeIOException(e);
             }
         }
         else
         {
             documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
         }
         //Unused values are null per the main() method invocation below
         //null is the default for these properties
         documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
         documentPreprocessor.SetEscaper(escaper);
         documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
         documentPreprocessor.SetTagDelimiter(tagDelimiter);
         documentPreprocessor.SetElementDelimiter(elementDelimiter);
         if (tokenizerFactory == null)
         {
             documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
         }
         else
         {
             documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
         }
         //Setup the output
         PrintWriter pwo = pwOut;
         if (op.testOptions.writeOutputFiles)
         {
             string normalizedName = filename;
             try
             {
                 new URL(normalizedName);
                 // this will exception if not a URL
                 normalizedName = normalizedName.ReplaceAll("/", "_");
             }
             catch (MalformedURLException)
             {
             }
             //It isn't a URL, so silently ignore
             string ext   = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
             string fname = normalizedName + '.' + ext;
             if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
             {
                 string fseparator = Runtime.GetProperty("file.separator");
                 if (fseparator == null || fseparator.IsEmpty())
                 {
                     fseparator = "/";
                 }
                 File fnameFile = new File(fname);
                 fname          = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
             }
             try
             {
                 pwo = op.tlpParams.Pw(new FileOutputStream(fname));
             }
             catch (IOException ioe)
             {
                 throw new RuntimeIOException(ioe);
             }
         }
         treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
         pwErr.Println("Parsing file: " + filename);
         int num          = 0;
         int numProcessed = 0;
         if (op.testOptions.testingThreads != 1)
         {
             MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 wrapper.Put(sentence);
                 while (wrapper.Peek())
                 {
                     IParserQuery pq = wrapper.Poll();
                     ProcessResults(pq, numProcessed++, pwo);
                 }
             }
             wrapper.Join();
             while (wrapper.Peek())
             {
                 IParserQuery pq = wrapper.Poll();
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         else
         {
             IParserQuery pq = pqFactory.ParserQuery();
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 pq.ParseAndReport(sentence, pwErr);
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         treePrint.PrintFooter(pwo);
         if (op.testOptions.writeOutputFiles)
         {
             pwo.Close();
         }
         pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
     }
      long millis = timer.Stop();
      if (summary)
     {
         if (pcfgLL != null)
         {
             pcfgLL.Display(false, pwErr);
         }
         if (depLL != null)
         {
             depLL.Display(false, pwErr);
         }
         if (factLL != null)
         {
             factLL.Display(false, pwErr);
         }
     }
      if (saidMemMessage)
     {
         ParserUtils.PrintOutOfMemory(pwErr);
     }
      double wordspersec = numWords / (((double)millis) / 1000);
      double sentspersec = numSents / (((double)millis) / 1000);
      NumberFormat nf    = new DecimalFormat("0.00");
      // easier way!
      pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
      if (numFallback > 0)
     {
         pwErr.Println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
     }
      if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
     {
         pwErr.Println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
         if (numUnparsable > 0)
         {
             pwErr.Println("    " + numUnparsable + " were not parsable with non-zero probability.");
         }
         if (numNoMemory > 0)
         {
             pwErr.Println("    " + numNoMemory + " were skipped because of insufficient memory.");
         }
         if (numSkipped > 0)
         {
             pwErr.Println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
         }
     }
     }
Example 17
        /// <summary>
        /// An example of a command line is
        /// <br />
        /// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel = null;
            string output      = null;
            IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
            int dvKBest    = 200;
            int numThreads = 1;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
                {
                    dvKBest   = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    output    = args[argIndex + 1];
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    treebanks.Add(treebankDescription);
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
                {
                    numThreads = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex  += 2;
                    continue;
                }
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
            if (parserModel == null)
            {
                throw new ArgumentException("Need to supply a parser model with -model");
            }
            if (output == null)
            {
                throw new ArgumentException("Need to supply an output filename with -output");
            }
            if (treebanks.IsEmpty())
            {
                throw new ArgumentException("Need to supply a treebank with -treebank");
            }
            log.Info("Writing output to " + output);
            log.Info("Loading parser model " + parserModel);
            log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
            LexicalizedParser    parser      = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", dvKBest.ToString()));
            CacheParseHypotheses cacher      = new CacheParseHypotheses(parser);
            ITreeTransformer     transformer = DVParser.BuildTrainTransformer(parser.GetOp());
            IList <Tree>         sentences   = new List <Tree>();

            foreach (Pair <string, IFileFilter> description in treebanks)
            {
                log.Info("Reading trees from " + description.first);
                Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(description.first, description.second);
                treebank = treebank.Transform(transformer);
                Sharpen.Collections.AddAll(sentences, treebank);
            }
            log.Info("Processing " + sentences.Count + " trees");
            IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();

            transformer = new SynchronizedTreeTransformer(transformer);
            MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

            foreach (Tree tree in sentences)
            {
                wrapper.Put(tree);
                while (wrapper.Peek())
                {
                    cache.Add(wrapper.Poll());
                    if (cache.Count % 10 == 0)
                    {
                        System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                cache.Add(wrapper.Poll());
                if (cache.Count % 10 == 0)
                {
                    System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                }
            }
            System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
            IOUtils.WriteObjectToFile(cache, output);
        }
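
Example 17 follows the produce-and-drain idiom that recurs throughout these listings: Put new work, Poll any finished results while still producing so memory stays bounded, then Join and drain once more. A minimal sketch of just that idiom, assuming the Sharpen-ported MulticoreWrapper and IThreadsafeProcessor from Stanford.NLP; UpperCaseProcessor, its member names, and the namespace are illustrative assumptions about the port:

using System;
using System.Collections.Generic;
// Assumed namespace for the Sharpen-ported concurrency utilities.
using Edu.Stanford.Nlp.Util.Concurrent;

internal class UpperCaseProcessor : IThreadsafeProcessor<string, string>
{
    // Member names mirror the Java ThreadsafeProcessor interface;
    // treat them as assumptions about the port.
    public string Process(string item)
    {
        return item.ToUpperInvariant();
    }

    public IThreadsafeProcessor<string, string> NewInstance()
    {
        return this; // stateless, so one instance can serve every thread
    }
}

internal static class WrapperIdiomDemo
{
    internal static void Run(IEnumerable<string> inputs, int nThreads)
    {
        MulticoreWrapper<string, string> wrapper = new MulticoreWrapper<string, string>(nThreads, new UpperCaseProcessor());
        foreach (string input in inputs)
        {
            wrapper.Put(input);      // enqueue work
            while (wrapper.Peek())   // drain finished results early to bound memory
            {
                Console.WriteLine(wrapper.Poll());
            }
        }
        wrapper.Join();              // wait for the remaining jobs
        while (wrapper.Peek())       // final drain
        {
            Console.WriteLine(wrapper.Poll());
        }
    }

    private static void Main()
    {
        Run(new[] { "first sentence", "second sentence", "third sentence" }, 2);
    }
}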
Example 18
        // fill value & derivative
        protected internal override void Calculate(double[] theta)
        {
            dvModel.VectorToParams(theta);
            double localValue = 0.0;

            double[] localDerivative = new double[theta.Length];
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryW_dfsB;

            binaryW_dfsG = TwoDimensionalMap.TreeMap();
            binaryW_dfsB = TwoDimensionalMap.TreeMap();
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesG;
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreDerivativesB;

            binaryScoreDerivativesG = TwoDimensionalMap.TreeMap();
            binaryScoreDerivativesB = TwoDimensionalMap.TreeMap();
            IDictionary <string, SimpleMatrix> unaryW_dfsG;
            IDictionary <string, SimpleMatrix> unaryW_dfsB;

            unaryW_dfsG = new SortedDictionary <string, SimpleMatrix>();
            unaryW_dfsB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesG;
            IDictionary <string, SimpleMatrix> unaryScoreDerivativesB;

            unaryScoreDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            unaryScoreDerivativesB = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesG = new SortedDictionary <string, SimpleMatrix>();
            IDictionary <string, SimpleMatrix> wordVectorDerivativesB = new SortedDictionary <string, SimpleMatrix>();

            foreach (TwoDimensionalMap.Entry <string, string, SimpleMatrix> entry in dvModel.binaryTransform)
            {
                int numRows = entry.GetValue().NumRows();
                int numCols = entry.GetValue().NumCols();
                binaryW_dfsG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryW_dfsB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(numRows, numCols));
                binaryScoreDerivativesG.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
                binaryScoreDerivativesB.Put(entry.GetFirstKey(), entry.GetSecondKey(), new SimpleMatrix(1, numRows));
            }
            foreach (KeyValuePair <string, SimpleMatrix> entry_1 in dvModel.unaryTransform)
            {
                int numRows = entry_1.Value.NumRows();
                int numCols = entry_1.Value.NumCols();
                unaryW_dfsG[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryW_dfsB[entry_1.Key]            = new SimpleMatrix(numRows, numCols);
                unaryScoreDerivativesG[entry_1.Key] = new SimpleMatrix(1, numRows);
                unaryScoreDerivativesB[entry_1.Key] = new SimpleMatrix(1, numRows);
            }
            if (op.trainOptions.trainWordVectors)
            {
                foreach (KeyValuePair <string, SimpleMatrix> entry_2 in dvModel.wordVectors)
                {
                    int numRows = entry_2.Value.NumRows();
                    int numCols = entry_2.Value.NumCols();
                    wordVectorDerivativesG[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                    wordVectorDerivativesB[entry_2.Key] = new SimpleMatrix(numRows, numCols);
                }
            }
            // Some optimization methods print out a line without an end, so our
            // debugging statements are misaligned
            Timing scoreTiming = new Timing();

            scoreTiming.Doing("Scoring trees");
            int treeNum = 0;
            MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> > wrapper = new MulticoreWrapper <Tree, Pair <DeepTree, DeepTree> >(op.trainOptions.trainingThreads, new DVParserCostAndGradient.ScoringProcessor(this));

            foreach (Tree tree in trainingBatch)
            {
                wrapper.Put(tree);
            }
            wrapper.Join();
            scoreTiming.Done();
            while (wrapper.Peek())
            {
                Pair <DeepTree, DeepTree> result = wrapper.Poll();
                DeepTree      goldTree           = result.first;
                DeepTree      bestTree           = result.second;
                StringBuilder treeDebugLine      = new StringBuilder();
                Formatter     formatter          = new Formatter(treeDebugLine);
                bool          isDone             = (Math.Abs(bestTree.GetScore() - goldTree.GetScore()) <= 0.00001 || goldTree.GetScore() > bestTree.GetScore());
                string        done = isDone ? "done" : string.Empty;
                formatter.Format("Tree %6d Highest tree: %12.4f Correct tree: %12.4f %s", treeNum, bestTree.GetScore(), goldTree.GetScore(), done);
                log.Info(treeDebugLine.ToString());
                if (!isDone)
                {
                    // if the gold tree is better than the best hypothesis tree by
                    // a large enough margin, then the score difference will be 0
                    // and we ignore the tree
                    double valueDelta = bestTree.GetScore() - goldTree.GetScore();
                    //double valueDelta = Math.max(0.0, - scoreGold + bestScore);
                    localValue += valueDelta;
                    // get the context words for this tree - should be the same
                    // for either goldTree or bestTree
                    IList <string> words = GetContextWords(goldTree.GetTree());
                    // The derivatives affected by this tree are only based on the
                    // nodes present in this tree, eg not all matrix derivatives
                    // will be affected by this tree
                    BackpropDerivative(goldTree.GetTree(), words, goldTree.GetVectors(), binaryW_dfsG, unaryW_dfsG, binaryScoreDerivativesG, unaryScoreDerivativesG, wordVectorDerivativesG);
                    BackpropDerivative(bestTree.GetTree(), words, bestTree.GetVectors(), binaryW_dfsB, unaryW_dfsB, binaryScoreDerivativesB, unaryScoreDerivativesB, wordVectorDerivativesB);
                }
                ++treeNum;
            }
            double[] localDerivativeGood;
            double[] localDerivativeB;
            if (op.trainOptions.trainWordVectors)
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator(), wordVectorDerivativesG.Values.GetEnumerator());
                localDerivativeB = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator(), wordVectorDerivativesB.Values.GetEnumerator());
            }
            else
            {
                localDerivativeGood = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsG.ValueIterator(), unaryW_dfsG.Values.GetEnumerator(), binaryScoreDerivativesG.ValueIterator(), unaryScoreDerivativesG.Values.GetEnumerator());
                localDerivativeB    = NeuralUtils.ParamsToVector(theta.Length, binaryW_dfsB.ValueIterator(), unaryW_dfsB.Values.GetEnumerator(), binaryScoreDerivativesB.ValueIterator(), unaryScoreDerivativesB.Values.GetEnumerator());
            }
            // gradient of the margin term: highest-scoring tree minus correct tree (B - Good)
            for (int i = 0; i < localDerivativeGood.Length; i++)
            {
                localDerivative[i] = localDerivativeB[i] - localDerivativeGood[i];
            }
            // TODO: this is where we would combine multiple costs if we had parallelized the calculation
            value      = localValue;
            derivative = localDerivative;
            // normalizing by training batch size
            value = (1.0 / trainingBatch.Count) * value;
            ArrayMath.MultiplyInPlace(derivative, (1.0 / trainingBatch.Count));
            // add regularization to cost:
            double[] currentParams = dvModel.ParamsToVector();
            double   regCost       = 0;

            foreach (double currentParam in currentParams)
            {
                regCost += currentParam * currentParam;
            }
            regCost = op.trainOptions.regCost * 0.5 * regCost;
            value  += regCost;
            // add regularization to gradient
            ArrayMath.MultiplyInPlace(currentParams, op.trainOptions.regCost);
            ArrayMath.PairwiseAddInPlace(derivative, currentParams);
        }
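
The tail of Calculate assembles a margin objective: the per-batch average of max(0, s(best) - s(gold)) plus an L2 penalty of op.trainOptions.regCost * 0.5 * ||theta||^2, whose gradient contribution is regCost * theta. A minimal free-standing sketch of that regularization step; the AddL2 helper, the lambda value, and the sample parameters are illustrative:

using System;

internal static class L2RegularizationDemo
{
    // Adds 0.5 * lambda * ||theta||^2 to the cost and lambda * theta to the
    // gradient, mirroring the regCost / PairwiseAddInPlace step above.
    private static double AddL2(double[] theta, double[] gradient, double lambda)
    {
        double sumSquares = 0.0;
        for (int i = 0; i < theta.Length; i++)
        {
            sumSquares += theta[i] * theta[i];
            gradient[i] += lambda * theta[i];
        }
        return 0.5 * lambda * sumSquares;
    }

    private static void Main()
    {
        double[] theta = { 0.5, -1.0, 2.0 };   // invented parameters
        double[] gradient = new double[3];
        double cost = AddL2(theta, gradient, 0.001);
        Console.WriteLine("regularization cost: " + cost);
    }
}

Example 19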
        /// <exception cref="System.Exception"/>
        public static void RunCoref(Properties props)
        {
            /*
             * property, environment setting
             */
            Redwood.HideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
            int    nThreads  = HybridCorefProperties.GetThreadCounts(props);
            string timeStamp = Calendar.GetInstance().GetTime().ToString().ReplaceAll("\\s", "-").ReplaceAll(":", "-");
            Logger logger    = Logger.GetLogger(typeof(Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem).FullName);

            // set log file path
            if (props.Contains(HybridCorefProperties.LogProp))
            {
                File logFile = new File(props.GetProperty(HybridCorefProperties.LogProp));
                RedwoodConfiguration.Current().Handlers(RedwoodConfiguration.Handlers.File(logFile)).Apply();
                Redwood.Log("Starting coref log");
            }
            log.Info(props.ToString());
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem cs = new Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem(props);

            /*
             * output setting
             */
            // prepare conll output
            string      goldOutput        = null;
            string      beforeCorefOutput = null;
            string      afterCorefOutput  = null;
            PrintWriter writerGold        = null;
            PrintWriter writerBeforeCoref = null;
            PrintWriter writerAfterCoref  = null;

            if (HybridCorefProperties.DoScore(props))
            {
                string pathOutput = CorefProperties.ConllOutputPath(props);
                (new File(pathOutput)).Mkdir();
                goldOutput        = pathOutput + "output-" + timeStamp + ".gold.txt";
                beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
                afterCorefOutput  = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
                writerGold        = new PrintWriter(new FileOutputStream(goldOutput));
                writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
                writerAfterCoref  = new PrintWriter(new FileOutputStream(afterCorefOutput));
            }
            // run coref
            MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]>(nThreads, new _IThreadsafeProcessor_134());
            // conll output and logs
            DateTime startTime = null;

            if (HybridCorefProperties.CheckTime(props))
            {
                startTime = new DateTime();
                System.Console.Error.Printf("END-TO-END COREF Start time: %s\n", startTime);
            }
            // run processes
            int docCnt = 0;

            while (true)
            {
                Document document = cs.docMaker.NextDoc();
                if (document == null)
                {
                    break;
                }
                wrapper.Put(Pair.MakePair(document, cs));
                docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            }
            // Finished reading the input. Wait for jobs to finish
            wrapper.Join();
            docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            IOUtils.CloseIgnoringExceptions(writerGold);
            IOUtils.CloseIgnoringExceptions(writerBeforeCoref);
            IOUtils.CloseIgnoringExceptions(writerAfterCoref);
            if (HybridCorefProperties.CheckTime(props))
            {
                System.Console.Error.Printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new DateTime()).GetTime() - startTime.GetTime()) / 1000F));
            }
            //      System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            // scoring
            if (HybridCorefProperties.DoScore(props))
            {
                string summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, beforeCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, false);
                summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, afterCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, true);
                CorefScorer.PrintFinalConllScore(summary);
            }
        }
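
RunCoref derives its three CoNLL output paths from a timestamp with whitespace and colons replaced by hyphens. A native-.NET sketch of that naming scheme, substituting Regex.Replace and Directory.CreateDirectory for the Sharpen Calendar/ReplaceAll/Mkdir calls; the directory name is invented:

using System;
using System.IO;
using System.Text.RegularExpressions;

internal static class ConllOutputPaths
{
    private static void Main()
    {
        string pathOutput = "conll-output/";  // assumed output directory
        // Sanitize the timestamp exactly as above: whitespace and ':' become '-'.
        string timeStamp = Regex.Replace(DateTime.Now.ToString(), @"[\s:]", "-");
        Directory.CreateDirectory(pathOutput);  // counterpart of File.Mkdir()
        string goldOutput = pathOutput + "output-" + timeStamp + ".gold.txt";
        string beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
        string afterCorefOutput = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
        Console.WriteLine(goldOutput);
        Console.WriteLine(beforeCorefOutput);
        Console.WriteLine(afterCorefOutput);
    }
}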
Example 20
        /// <summary>Test on a file containing correct tags already.</summary>
        /// <remarks>
        /// Test on a file containing correct tags already, when initializing from trees.
        /// TODO: Add the ability to have a second transformer to transform output back; possibly combine this method
        /// with the method below.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        private void Test()
        {
            numSentences    = 0;
            confusionMatrix = new ConfusionMatrix <string>();
            PrintFile pf  = null;
            PrintFile pf1 = null;
            PrintFile pf3 = null;

            if (writeWords)
            {
                pf = new PrintFile(saveRoot + ".words");
            }
            if (writeUnknDict)
            {
                pf1 = new PrintFile(saveRoot + ".un.dict");
            }
            if (writeTopWords)
            {
                pf3 = new PrintFile(saveRoot + ".words.top");
            }
            bool verboseResults = config.GetVerboseResults();

            if (config.GetNThreads() != 1)
            {
                MulticoreWrapper <IList <TaggedWord>, TestSentence> wrapper = new MulticoreWrapper <IList <TaggedWord>, TestSentence>(config.GetNThreads(), new TestClassifier.TestSentenceProcessor(maxentTagger));
                foreach (IList <TaggedWord> taggedSentence in fileRecord.Reader())
                {
                    wrapper.Put(taggedSentence);
                    while (wrapper.Peek())
                    {
                        ProcessResults(wrapper.Poll(), pf, pf1, pf3, verboseResults);
                    }
                }
                wrapper.Join();
                while (wrapper.Peek())
                {
                    ProcessResults(wrapper.Poll(), pf, pf1, pf3, verboseResults);
                }
            }
            else
            {
                foreach (IList <TaggedWord> taggedSentence in fileRecord.Reader())
                {
                    TestSentence testS = new TestSentence(maxentTagger);
                    testS.SetCorrectTags(taggedSentence);
                    testS.TagSentence(taggedSentence, false);
                    ProcessResults(testS, pf, pf1, pf3, verboseResults);
                }
            }
            if (pf != null)
            {
                pf.Close();
            }
            if (pf1 != null)
            {
                pf1.Close();
            }
            if (pf3 != null)
            {
                pf3.Close();
            }
        }
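
Test() routes every tagged sentence through ProcessResults, which among other things feeds the ConfusionMatrix initialized at the top. ConfusionMatrix is Stanford.NLP's own type; as a rough stand-in, the bookkeeping amounts to counting (gold, guessed) tag pairs, as in this toy sketch with invented data:

using System;
using System.Collections.Generic;

internal static class TagConfusionDemo
{
    private static void Main()
    {
        // Invented (gold, guessed) tag pairs standing in for tagger output.
        var pairs = new[] { ("NN", "NN"), ("VB", "NN"), ("NN", "NN"), ("VB", "VB") };
        var counts = new Dictionary<(string gold, string guess), int>();
        foreach (var (gold, guess) in pairs)
        {
            counts.TryGetValue((gold, guess), out int c);
            counts[(gold, guess)] = c + 1;  // tally one cell of the confusion matrix
        }
        foreach (var entry in counts)
        {
            Console.WriteLine($"gold={entry.Key.gold} guess={entry.Key.guess} count={entry.Value}");
        }
    }
}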