/// <summary>Train a segmenter from raw text.</summary>
/// <remarks>Train a segmenter from raw text. Gold segmentation markers are required.</remarks>
public virtual void Train()
{
    // Training input must carry explicit gold segmentation markers and tags.
    const bool withSegMarkers = true;
    const bool withTags = true;
    IDocumentReaderAndWriter<CoreLabel> reader =
        new ArabicDocumentReaderAndWriter(withSegMarkers, withTags, hasDomainLabels, domain, noRewrites, tf);
    // Load the training file into per-sentence CoreLabel sequences, then train.
    ObjectBank<IList<CoreLabel>> trainingData = classifier.MakeObjectBankFromFile(flags.trainFile, reader);
    classifier.Train(trainingData, reader);
    log.Info("Finished training.");
}
/// <summary>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses.
/// </summary>
/// <remarks>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses. In other words, the evaluation file has the
/// same format as the training data. Optionally also writes TEDEval-format gold and
/// parse files when <c>tedEvalPrefix</c> is set.
/// </remarks>
/// <param name="pwOut">Destination writer for the evaluation report.</param>
private void Evaluate(PrintWriter pwOut)
{
    log.Info("Starting evaluation...");
    bool hasSegmentationMarkers = true;
    bool hasTags = true;
    IDocumentReaderAndWriter<CoreLabel> docReader =
        new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
    ObjectBank<IList<CoreLabel>> lines = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
    // Optional TEDEval output streams. Invariant: either all four are open or all are null.
    PrintWriter tedEvalGoldTree = null;
    PrintWriter tedEvalParseTree = null;
    PrintWriter tedEvalGoldSeg = null;
    PrintWriter tedEvalParseSeg = null;
    if (tedEvalPrefix != null)
    {
        try
        {
            tedEvalGoldTree = new PrintWriter(tedEvalPrefix + "_gold.ftree");
            tedEvalGoldSeg = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
            tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
            tedEvalParseSeg = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
        }
        catch (FileNotFoundException e)
        {
            System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
            // BUGFIX: close any writers opened before the failure so the file handles
            // do not leak, and null them all so TEDEval output is skipped consistently
            // (the guard below tests tedEvalParseSeg on behalf of all four).
            if (tedEvalGoldTree != null)
            {
                tedEvalGoldTree.Close();
            }
            if (tedEvalGoldSeg != null)
            {
                tedEvalGoldSeg.Close();
            }
            if (tedEvalParseTree != null)
            {
                tedEvalParseTree.Close();
            }
            tedEvalGoldTree = null;
            tedEvalGoldSeg = null;
            tedEvalParseTree = null;
            tedEvalParseSeg = null;
        }
    }
    ICounter<string> labelTotal = new ClassicCounter<string>();
    ICounter<string> labelCorrect = new ClassicCounter<string>();
    int total = 0;
    int correct = 0;
    foreach (IList<CoreLabel> line in lines)
    {
        // Input tokens with gold boundaries removed; ":" is reserved as the TEDEval
        // segment separator, so literal colons are escaped to "#pm#".
        string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
        string[] goldTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        // BUGFIX: the Java original reassigned the foreach iteration variable
        // (line = classifier.Classify(line)), which is illegal in C#; use a
        // separate local for the classified sequence instead.
        IList<CoreLabel> classified = classifier.Classify(line);
        string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(classified, ":")).Split(" ");
        foreach (CoreLabel label in classified)
        {
            // Do not evaluate labeling of whitespace
            string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
            if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
            {
                total++;
                string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                string reference = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                labelTotal.IncrementCount(reference);
                if (hypothesis.Equals(reference))
                {
                    correct++;
                    labelCorrect.IncrementCount(reference);
                }
            }
        }
        if (tedEvalParseSeg != null)
        {
            tedEvalGoldTree.Printf("(root");
            tedEvalParseTree.Printf("(root");
            // Token counts can disagree when sanitization drops tokens; truncate to
            // the shortest aligned prefix rather than indexing out of range.
            int safeLength = inputTokens.Length;
            if (inputTokens.Length != goldTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                log.Info("  (ignoring any extras)");
                log.Info("  input: " + Arrays.ToString(inputTokens));
                log.Info("  gold: " + Arrays.ToString(goldTokens));
                safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
            }
            if (inputTokens.Length != parseTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                log.Info("  (ignoring any extras)");
                log.Info("  input: " + Arrays.ToString(inputTokens));
                log.Info("  parse: " + Arrays.ToString(parseTokens));
                safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
            }
            for (int i = 0; i < safeLength; i++)
            {
                foreach (string segment in goldTokens[i].Split(":"))
                {
                    tedEvalGoldTree.Printf(" (seg %s)", segment);
                }
                tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                foreach (string segment_1 in parseTokens[i].Split(":"))
                {
                    tedEvalParseTree.Printf(" (seg %s)", segment_1);
                }
                tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
            }
            tedEvalGoldTree.Printf(")%n");
            tedEvalGoldSeg.Println();
            tedEvalParseTree.Printf(")%n");
            tedEvalParseSeg.Println();
        }
    }
    // NOTE: if the test file yields no evaluable datums, total == 0 and accuracy is
    // NaN — same as the original behavior; the report then shows "NaN".
    double accuracy = ((double)correct) / ((double)total);
    accuracy *= 100.0;
    pwOut.Println("EVALUATION RESULTS");
    pwOut.Printf("#datums:\t%d%n", total);
    pwOut.Printf("#correct:\t%d%n", correct);
    pwOut.Printf("accuracy:\t%.2f%n", accuracy);
    pwOut.Println("==================");
    // Output the per label accuracies
    pwOut.Println("PER LABEL ACCURACIES");
    foreach (string refLabel in labelTotal.KeySet())
    {
        double nTotal = labelTotal.GetCount(refLabel);
        double nCorrect = labelCorrect.GetCount(refLabel);
        double acc = (nCorrect / nTotal) * 100.0;
        pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
    }
    if (tedEvalParseSeg != null)
    {
        tedEvalGoldTree.Close();
        tedEvalGoldSeg.Close();
        tedEvalParseTree.Close();
        tedEvalParseSeg.Close();
    }
}
/// <summary>
/// Constructs the converted anonymous serializable-function instance, binding it to
/// its enclosing <c>ArabicDocumentReaderAndWriter</c>.
/// </summary>
/// <param name="_enclosing">The enclosing reader/writer (the outer-class reference captured by the original Java anonymous inner class).</param>
public _ISerializableFunction_131(ArabicDocumentReaderAndWriter _enclosing)
{
    this._enclosing = _enclosing;
    // Serialization version pinned to the value generated for the original Java
    // anonymous class, so previously serialized instances remain compatible.
    this.serialVersionUID = 5243251505653686497L;
}
/// <summary>
/// Constructs the converted anonymous serializable-function instance, binding it to
/// its enclosing <c>ArabicDocumentReaderAndWriter</c>.
/// </summary>
/// <param name="_enclosing">The enclosing reader/writer (the outer-class reference captured by the original Java anonymous inner class).</param>
public _ISerializableFunction_131(ArabicDocumentReaderAndWriter _enclosing)
{
    this._enclosing = _enclosing;
    // NOTE(review): if `serialVersionUID` on the right-hand side resolves to this
    // same instance field, this line is a no-op self-assignment (CA2245) and the
    // field stays at its default — compare the sibling constructor that assigns an
    // explicit constant. Confirm the RHS refers to an outer/static constant; if it
    // does not, this assignment needs the intended version value.
    this.serialVersionUID = serialVersionUID;
}