/// <summary>
/// Segments one line of raw text, returning the segmented surface string
/// with prefix/suffix markers applied.
/// </summary>
/// <param name="line">Raw (unsegmented) input text.</param>
/// <returns>The segmented text as a single string.</returns>
public virtual string SegmentString(string line)
{
    // Run the character classifier, then render the IOB sequence back to text.
    IList<CoreLabel> labeled = SegmentStringToIOB(line);
    return IOBUtils.IOBToString(labeled, prefixMarker, suffixMarker);
}
/// <summary>
/// Converts a raw line into an IOB-labeled character sequence: tokenizes
/// (or falls back to whitespace splitting when no tokenizer factory is set),
/// tags the domain, and classifies each character.
/// </summary>
/// <param name="line">Raw input text.</param>
/// <returns>The classified IOB sequence.</returns>
private IList<CoreLabel> SegmentStringToIOB(string line)
{
    IList<CoreLabel> tokenList;
    if (tf != null)
    {
        IList<CoreLabel> tokens = tf.GetTokenizer(new StringReader(line)).Tokenize();
        tokenList = IOBUtils.StringToIOB(tokens, null, false, tf, line);
    }
    else
    {
        // No tokenizer factory configured: whitespace tokenization.
        tokenList = IOBUtils.StringToIOB(line);
    }
    IOBUtils.LabelDomain(tokenList, domain);
    return classifier.Classify(tokenList);
}
/// <summary>
/// Segments a line and returns the result as a list of CoreLabel tokens,
/// one per segment span, carrying text, original text, and character offsets.
/// </summary>
/// <param name="line">Raw input text.</param>
/// <returns>One CoreLabel per segmented token.</returns>
public virtual IList<CoreLabel> SegmentStringToTokenList(string line)
{
    IList<CoreLabel> labeledSequence = SegmentStringToIOB(line);
    IList<CoreLabel> tokenList = CollectionUtils.MakeList();
    foreach (IntPair span in IOBUtils.TokenSpansForIOB(labeledSequence))
    {
        int first = span.GetSource();
        int last = span.GetTarget();
        // Render just this span of the IOB sequence as a token string.
        string text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker, first, last);
        // Character offsets come from the underlying labeled characters.
        int start = labeledSequence[first].BeginPosition();
        int end = labeledSequence[last - 1].EndPosition();
        CoreLabel token = new CoreLabel();
        token.SetWord(text);
        token.SetValue(text);
        token.Set(typeof(CoreAnnotations.TextAnnotation), text);
        token.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");
        token.SetOriginalText(Sharpen.Runtime.Substring(line, start, end));
        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), start);
        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        tokenList.Add(token);
    }
    return tokenList;
}
/// <summary>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses.
/// </summary>
/// <remarks>
/// Evaluate accuracy when the input is gold segmented text *with* segmentation
/// markers and morphological analyses. In other words, the evaluation file has the
/// same format as the training data. Also optionally emits TEDEval gold/parse
/// tree and segmentation files when <c>tedEvalPrefix</c> is set.
/// </remarks>
/// <param name="pwOut">Destination for the evaluation report.</param>
private void Evaluate(PrintWriter pwOut)
{
    log.Info("Starting evaluation...");
    bool hasSegmentationMarkers = true;
    bool hasTags = true;
    // Read the test file in the same (gold) format as the training data.
    IDocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
    ObjectBank<IList<CoreLabel>> lines = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
    // TEDEval outputs are optional; all four writers stay null unless a prefix is given.
    PrintWriter tedEvalGoldTree = null;
    PrintWriter tedEvalParseTree = null;
    PrintWriter tedEvalGoldSeg = null;
    PrintWriter tedEvalParseSeg = null;
    if (tedEvalPrefix != null)
    {
        try
        {
            tedEvalGoldTree = new PrintWriter(tedEvalPrefix + "_gold.ftree");
            tedEvalGoldSeg = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
            tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
            tedEvalParseSeg = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
        }
        catch (FileNotFoundException e)
        {
            // NOTE(review): if a later writer fails to open, earlier ones are left
            // open and unclosed (the guards below test only tedEvalParseSeg) — confirm
            // whether this partial-failure leak is acceptable.
            System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
        }
    }
    // Per-label tallies for the per-label accuracy report at the end.
    ICounter<string> labelTotal = new ClassicCounter<string>();
    ICounter<string> labelCorrect = new ClassicCounter<string>();
    int total = 0;
    int correct = 0;
    foreach (IList<CoreLabel> line in lines)
    {
        // Capture the input and gold tokenizations BEFORE classification mutates the line.
        // ":" inside input tokens is masked as "#pm#" so it cannot collide with the
        // ":" used below as the segment delimiter.
        string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
        string[] goldTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        line = classifier.Classify(line);
        string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
        foreach (CoreLabel label in line)
        {
            // Do not evaluate labeling of whitespace
            string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
            if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
            {
                total++;
                string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                string reference = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                labelTotal.IncrementCount(reference);
                if (hypothesis.Equals(reference))
                {
                    correct++;
                    labelCorrect.IncrementCount(reference);
                }
            }
        }
        if (tedEvalParseSeg != null)
        {
            // Emit one flat (root (seg ...) ...) tree per sentence for TEDEval.
            tedEvalGoldTree.Printf("(root");
            tedEvalParseTree.Printf("(root");
            // Token counts can disagree when sanitization or classification changes
            // the tokenization; clamp to the shortest and warn rather than crash.
            int safeLength = inputTokens.Length;
            if (inputTokens.Length != goldTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" gold: " + Arrays.ToString(goldTokens));
                safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
            }
            if (inputTokens.Length != parseTokens.Length)
            {
                log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                log.Info(" (ignoring any extras)");
                log.Info(" input: " + Arrays.ToString(inputTokens));
                log.Info(" parse: " + Arrays.ToString(parseTokens));
                safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
            }
            for (int i = 0; i < safeLength; i++)
            {
                // Gold/parse tokens hold ":"-joined segments; write each as a (seg ...) leaf.
                foreach (string segment in goldTokens[i].Split(":"))
                {
                    tedEvalGoldTree.Printf(" (seg %s)", segment);
                }
                tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                foreach (string segment_1 in parseTokens[i].Split(":"))
                {
                    tedEvalParseTree.Printf(" (seg %s)", segment_1);
                }
                tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
            }
            tedEvalGoldTree.Printf(")%n");
            tedEvalGoldSeg.Println();
            tedEvalParseTree.Printf(")%n");
            tedEvalParseSeg.Println();
        }
    }
    // NOTE(review): yields NaN if the test set contributed zero non-boundary
    // characters (total == 0) — confirm that case cannot occur upstream.
    double accuracy = ((double)correct) / ((double)total);
    accuracy *= 100.0;
    pwOut.Println("EVALUATION RESULTS");
    pwOut.Printf("#datums:\t%d%n", total);
    pwOut.Printf("#correct:\t%d%n", correct);
    pwOut.Printf("accuracy:\t%.2f%n", accuracy);
    pwOut.Println("==================");
    // Output the per label accuracies
    pwOut.Println("PER LABEL ACCURACIES");
    foreach (string refLabel in labelTotal.KeySet())
    {
        double nTotal = labelTotal.GetCount(refLabel);
        double nCorrect = labelCorrect.GetCount(refLabel);
        double acc = (nCorrect / nTotal) * 100.0;
        pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
    }
    if (tedEvalParseSeg != null)
    {
        tedEvalGoldTree.Close();
        tedEvalGoldSeg.Close();
        tedEvalParseTree.Close();
        tedEvalParseSeg.Close();
    }
}
/// <summary>
/// Converts one input line into an IOB-labeled character sequence.
/// Handles three input shapes depending on the enclosing reader's flags:
/// an optional leading domain label, optional word/TAG pairs (training format,
/// possibly with raw&gt;rewritten forms), or plain raw text.
/// </summary>
/// <param name="@in">One line of input; may begin with a domain label when
/// <c>inputHasDomainLabels</c> is set.</param>
/// <returns>The IOB character sequence for this line, domain-labeled.</returns>
public IList<CoreLabel> Apply(string @in)
{
    IList<CoreLabel> tokenList;
    string lineDomain = string.Empty;
    if (this._enclosing.inputHasDomainLabels)
    {
        // First whitespace-delimited field is the domain label; the rest is the text.
        string[] domainAndData = @in.Split("\\s+", 2);
        if (domainAndData.Length < 2)
        {
            // Malformed line: warn and fall through with lineDomain == "" and @in unchanged.
            ArabicDocumentReaderAndWriter.log.Info("Missing domain label or text: ");
            ArabicDocumentReaderAndWriter.log.Info(@in);
        }
        else
        {
            lineDomain = domainAndData[0];
            @in = domainAndData[1];
        }
    }
    else
    {
        lineDomain = this._enclosing.inputDomain;
    }
    if (this._enclosing.inputHasTags)
    {
        // Training-format input: whitespace-separated word/TAG pairs, where the word
        // part may itself be a raw/rewritten pair separated by rewriteDelimiter.
        string[] toks = @in.Split("\\s+");
        IList<CoreLabel> input = new List<CoreLabel>(toks.Length);
        string tagDelim = Pattern.Quote(ArabicDocumentReaderAndWriter.tagDelimiter);
        string rewDelim = Pattern.Quote(ArabicDocumentReaderAndWriter.rewriteDelimiter);
        foreach (string wordTag in toks)
        {
            string[] wordTagPair = wordTag.Split(tagDelim);
            System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
            string[] rewritePair = wordTagPair[0].Split(rewDelim);
            System.Diagnostics.Debug.Assert(rewritePair.Length == 1 || rewritePair.Length == 2);
            string raw = rewritePair[0];
            // With no explicit rewrite, the rewritten form defaults to the raw form.
            string rewritten = raw;
            if (rewritePair.Length == 2)
            {
                rewritten = rewritePair[1];
            }
            CoreLabel cl = new CoreLabel();
            if (this._enclosing.tf != null)
            {
                // Re-tokenize both forms so raw and rewritten stay aligned segment-by-segment.
                IList<CoreLabel> lexListRaw = this._enclosing.tf.GetTokenizer(new StringReader(raw)).Tokenize();
                IList<CoreLabel> lexListRewritten = this._enclosing.tf.GetTokenizer(new StringReader(rewritten)).Tokenize();
                if (lexListRewritten.Count != lexListRaw.Count)
                {
                    // Alignment impossible: warn and fall back to the raw tokenization.
                    System.Console.Error.Printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.GetType().FullName, raw, rewritten);
                    lexListRewritten = lexListRaw;
                }
                if (lexListRaw.IsEmpty())
                {
                    // Tokenizer consumed the whole token (e.g. stripped content): skip it.
                    continue;
                }
                else
                {
                    if (lexListRaw.Count == 1)
                    {
                        raw = lexListRaw[0].Value();
                        rewritten = lexListRewritten[0].Value();
                    }
                    else
                    {
                        if (lexListRaw.Count > 1)
                        {
                            string secondWord = lexListRaw[1].Value();
                            if (secondWord.Equals(this._enclosing.segMarker.ToString()))
                            {
                                // Special case for the null marker in the vocalized section:
                                // keep the first segment and re-attach the segmentation marker.
                                raw = lexListRaw[0].Value() + this._enclosing.segMarker;
                                rewritten = lexListRewritten[0].Value() + this._enclosing.segMarker;
                            }
                            else
                            {
                                // Unexpected multi-segment token: warn and keep only the first segment.
                                System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", this.GetType().FullName, raw);
                                raw = lexListRaw[0].Value();
                                rewritten = lexListRewritten[0].Value();
                            }
                        }
                    }
                }
            }
            cl.SetValue(raw);
            cl.SetWord(raw);
            cl.SetTag(wordTagPair[1]);
            cl.Set(typeof(CoreAnnotations.DomainAnnotation), lineDomain);
            cl.Set(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation), rewritten);
            input.Add(cl);
        }
        tokenList = IOBUtils.StringToIOB(input, this._enclosing.segMarker, true, this._enclosing.shouldStripRewrites);
    }
    else
    {
        if (this._enclosing.tf == null)
        {
            // No tokenizer factory: whitespace tokenization of the raw line.
            tokenList = IOBUtils.StringToIOB(@in, this._enclosing.segMarker);
        }
        else
        {
            IList<CoreLabel> line = this._enclosing.tf.GetTokenizer(new StringReader(@in)).Tokenize();
            tokenList = IOBUtils.StringToIOB(line, this._enclosing.segMarker, false);
        }
    }
    // In the tagged path the domain was already attached per-token above;
    // otherwise label the whole sequence here.
    if (this._enclosing.inputHasDomainLabels && !this._enclosing.inputHasTags)
    {
        IOBUtils.LabelDomain(tokenList, lineDomain);
    }
    else
    {
        if (!this._enclosing.inputHasDomainLabels)
        {
            IOBUtils.LabelDomain(tokenList, this._enclosing.inputDomain);
        }
    }
    return(tokenList);
}