Esempio n. 1
0
        /// <summary>Trains the segmenter on the file named by <c>flags.trainFile</c>.</summary>
        /// <remarks>
        /// The training file is read through an <c>ArabicDocumentReaderAndWriter</c> that is
        /// constructed with both the segmentation-marker and tag flags switched on, so the
        /// training data must carry gold segmentation markers.
        /// </remarks>
        public virtual void Train()
        {
            // Both flags are fixed for training: the reader is always told that the
            // input carries segmentation markers and tags.
            const bool withSegmentationMarkers = true;
            const bool withTags = true;

            IDocumentReaderAndWriter<CoreLabel> reader = new ArabicDocumentReaderAndWriter(
                withSegmentationMarkers,
                withTags,
                hasDomainLabels,
                domain,
                noRewrites,
                tf);
            ObjectBank<IList<CoreLabel>> trainingData = classifier.MakeObjectBankFromFile(flags.trainFile, reader);

            classifier.Train(trainingData, reader);
            log.Info("Finished training.");
        }
Esempio n. 2
0
        /// <summary>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses.
        /// </summary>
        /// <remarks>
        /// The evaluation file (<c>flags.testFile</c>) has the same format as the training
        /// data. Overall and per-label accuracies are printed to <paramref name="pwOut"/>.
        /// When <c>tedEvalPrefix</c> is non-null, four extra files are written next to that
        /// prefix (<c>_gold.ftree</c>, <c>_gold.segmentation</c>, <c>_parse.ftree</c>,
        /// <c>_parse.segmentation</c>). If any of those files cannot be opened, the error is
        /// reported on standard error and no TEDEval output is produced.
        /// </remarks>
        /// <param name="pwOut">Writer that receives the evaluation report.</param>
        private void Evaluate(PrintWriter pwOut)
        {
            log.Info("Starting evaluation...");
            bool hasSegmentationMarkers = true;
            bool hasTags = true;
            IDocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
            ObjectBank<IList<CoreLabel>> lines = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
            PrintWriter tedEvalGoldTree  = null;
            PrintWriter tedEvalParseTree = null;
            PrintWriter tedEvalGoldSeg   = null;
            PrintWriter tedEvalParseSeg  = null;

            try
            {
                if (tedEvalPrefix != null)
                {
                    try
                    {
                        tedEvalGoldTree  = new PrintWriter(tedEvalPrefix + "_gold.ftree");
                        tedEvalGoldSeg   = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
                        tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
                        tedEvalParseSeg  = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
                    }
                    catch (FileNotFoundException e)
                    {
                        System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
                    }
                }

                // tedEvalParseSeg is opened last, so it is non-null only when all four
                // writers were created successfully.
                bool writeTedEval = tedEvalParseSeg != null;

                ICounter<string> labelTotal   = new ClassicCounter<string>();
                ICounter<string> labelCorrect = new ClassicCounter<string>();
                int total   = 0;
                int correct = 0;

                foreach (IList<CoreLabel> line in lines)
                {
                    // Input and gold strings are rendered before classification.
                    string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
                    string[] goldTokens  = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");

                    // A foreach iteration variable cannot be reassigned in C#, so the
                    // classifier's result is held in its own local.
                    IList<CoreLabel> classified = classifier.Classify(line);
                    string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(classified, ":")).Split(" ");

                    foreach (CoreLabel label in classified)
                    {
                        // Do not evaluate labeling of whitespace
                        string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
                        if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
                        {
                            total++;
                            string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                            string reference  = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                            labelTotal.IncrementCount(reference);
                            if (hypothesis.Equals(reference))
                            {
                                correct++;
                                labelCorrect.IncrementCount(reference);
                            }
                        }
                    }

                    if (writeTedEval)
                    {
                        tedEvalGoldTree.Printf("(root");
                        tedEvalParseTree.Printf("(root");
                        if (inputTokens.Length != goldTokens.Length)
                        {
                            log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                            log.Info("    (ignoring any extras)");
                            log.Info("  input: " + Arrays.ToString(inputTokens));
                            log.Info("  gold: " + Arrays.ToString(goldTokens));
                        }
                        if (inputTokens.Length != parseTokens.Length)
                        {
                            log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                            log.Info("    (ignoring any extras)");
                            log.Info("  input: " + Arrays.ToString(inputTokens));
                            log.Info("  parse: " + Arrays.ToString(parseTokens));
                        }

                        // All three arrays are indexed with the same i below, so the loop
                        // bound must be the shortest of the three, not just of the last
                        // mismatching pair.
                        int safeLength = Math.Min(inputTokens.Length, Math.Min(goldTokens.Length, parseTokens.Length));
                        for (int i = 0; i < safeLength; i++)
                        {
                            foreach (string segment in goldTokens[i].Split(":"))
                            {
                                tedEvalGoldTree.Printf(" (seg %s)", segment);
                            }
                            tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                            foreach (string segment_1 in parseTokens[i].Split(":"))
                            {
                                tedEvalParseTree.Printf(" (seg %s)", segment_1);
                            }
                            tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
                        }
                        tedEvalGoldTree.Printf(")%n");
                        tedEvalGoldSeg.Println();
                        tedEvalParseTree.Printf(")%n");
                        tedEvalParseSeg.Println();
                    }
                }

                // Floating-point division: the quotient is NaN when total is zero.
                double accuracy = ((double)correct) / ((double)total);

                accuracy *= 100.0;
                pwOut.Println("EVALUATION RESULTS");
                pwOut.Printf("#datums:\t%d%n", total);
                pwOut.Printf("#correct:\t%d%n", correct);
                pwOut.Printf("accuracy:\t%.2f%n", accuracy);
                pwOut.Println("==================");
                // Output the per label accuracies
                pwOut.Println("PER LABEL ACCURACIES");
                foreach (string refLabel in labelTotal.KeySet())
                {
                    double nTotal   = labelTotal.GetCount(refLabel);
                    double nCorrect = labelCorrect.GetCount(refLabel);
                    double acc      = (nCorrect / nTotal) * 100.0;
                    pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
                }
            }
            finally
            {
                // Close every writer that was actually opened, including the case where
                // only some of the four files could be created or evaluation threw.
                PrintWriter[] tedEvalWriters = new PrintWriter[] { tedEvalGoldTree, tedEvalGoldSeg, tedEvalParseTree, tedEvalParseSeg };
                foreach (PrintWriter writer in tedEvalWriters)
                {
                    if (writer != null)
                    {
                        writer.Close();
                    }
                }
            }
        }
Esempio n. 3
0
 /// <summary>
 /// Creates the function object, recording the supplied reader/writer instance and
 /// setting the serialization version identifier.
 /// </summary>
 /// <param name="_enclosing">Instance stored in the <c>_enclosing</c> field.</param>
 public _ISerializableFunction_131(ArabicDocumentReaderAndWriter _enclosing)
 {
     // Fixed serialization version identifier for this type.
     this.serialVersionUID = 5243251505653686497L;
     this._enclosing = _enclosing;
 }
 /// <summary>
 /// Creates the function object, recording the supplied reader/writer instance and
 /// setting the serialization version identifier.
 /// </summary>
 /// <param name="_enclosing">Instance stored in the <c>_enclosing</c> field.</param>
 public _ISerializableFunction_131(ArabicDocumentReaderAndWriter _enclosing)
 {
     this._enclosing       = _enclosing;
     // Assign the literal identifier; assigning the field to itself would leave
     // it at its default value.
     this.serialVersionUID = 5243251505653686497L;
 }