Esempio n. 1
0
        /// <summary>
        /// Segments a line of text and renders the labeled result as a single string.
        /// </summary>
        /// <param name="line">The text to segment.</param>
        /// <returns>
        /// The string produced by IOBUtils.IOBToString for the labeled sequence, using
        /// the prefixMarker and suffixMarker members.
        /// </returns>
        public virtual string SegmentString(string line)
        {
            // Label the characters first, then let IOBUtils render them with the markers.
            IList <CoreLabel> labeled = SegmentStringToIOB(line);

            return IOBUtils.IOBToString(labeled, prefixMarker, suffixMarker);
        }
Esempio n. 2
0
        /// <summary>
        /// Converts a line of text into an IOB sequence, labels it with the domain
        /// member, and runs it through the classifier.
        /// </summary>
        /// <param name="line">The text to convert and classify.</param>
        /// <returns>The sequence returned by classifier.Classify.</returns>
        private IList <CoreLabel> SegmentStringToIOB(string line)
        {
            IList <CoreLabel> sequence;

            if (tf != null)
            {
                // A tokenizer factory is configured: tokenize first, then build the IOB sequence.
                IList <CoreLabel> tokens = tf.GetTokenizer(new StringReader(line)).Tokenize();
                sequence = IOBUtils.StringToIOB(tokens, null, false, tf, line);
            }
            else
            {
                // Whitespace tokenization.
                sequence = IOBUtils.StringToIOB(line);
            }

            IOBUtils.LabelDomain(sequence, domain);
            return classifier.Classify(sequence);
        }
Esempio n. 3
0
        /// <summary>
        /// Segments a line of text and returns one CoreLabel for every span reported by
        /// IOBUtils.TokenSpansForIOB.
        /// </summary>
        /// <remarks>
        /// Each returned token carries the rendered segment text as its word, value and
        /// TextAnnotation, an ArabicSegAnnotation of "1", the corresponding substring of
        /// <paramref name="line"/> as its original text, and begin/end character offsets.
        /// </remarks>
        /// <param name="line">The text to segment.</param>
        /// <returns>The list of tokens, in the order the spans are enumerated.</returns>
        public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
        {
            IList <CoreLabel> result  = CollectionUtils.MakeList();
            IList <CoreLabel> labeled = SegmentStringToIOB(line);

            foreach (IntPair span in IOBUtils.TokenSpansForIOB(labeled))
            {
                int first = span.GetSource();
                // The target is used as an exclusive end here: the last element is target - 1.
                int limit = span.GetTarget();

                string segmentText = IOBUtils.IOBToString(labeled, prefixMarker, suffixMarker, first, limit);
                int    beginOffset = labeled[first].BeginPosition();
                int    endOffset   = labeled[limit - 1].EndPosition();

                CoreLabel segment = new CoreLabel();
                segment.SetWord(segmentText);
                segment.SetValue(segmentText);
                segment.Set(typeof(CoreAnnotations.TextAnnotation), segmentText);
                segment.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");
                segment.SetOriginalText(Sharpen.Runtime.Substring(line, beginOffset, endOffset));
                segment.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), beginOffset);
                segment.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), endOffset);

                result.Add(segment);
            }

            return result;
        }
Esempio n. 4
0
        /// <summary>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses.
        /// </summary>
        /// <remarks>
        /// The evaluation file (flags.testFile) has the same format as the training data.
        /// Every non-boundary character label is compared against its gold label, and
        /// overall and per-label accuracies are written to <paramref name="pwOut"/>.
        /// When tedEvalPrefix is set and all four TEDEval files can be opened, gold and
        /// parse trees/segmentations are written to
        /// "&lt;prefix&gt;_gold.ftree", "&lt;prefix&gt;_gold.segmentation",
        /// "&lt;prefix&gt;_parse.ftree" and "&lt;prefix&gt;_parse.segmentation".
        /// Any TEDEval writer that was opened is closed before the method returns,
        /// including when opening a later file fails or evaluation throws.
        /// </remarks>
        /// <param name="pwOut">Writer that receives the evaluation report.</param>
        private void Evaluate(PrintWriter pwOut)
        {
            log.Info("Starting evaluation...");
            bool hasSegmentationMarkers = true;
            bool hasTags = true;
            IDocumentReaderAndWriter <CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
            ObjectBank <IList <CoreLabel> >      lines     = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
            PrintWriter tedEvalGoldTree  = null;
            PrintWriter tedEvalParseTree = null;
            PrintWriter tedEvalGoldSeg   = null;
            PrintWriter tedEvalParseSeg  = null;

            try
            {
                if (tedEvalPrefix != null)
                {
                    try
                    {
                        // tedEvalParseSeg is opened last, so it is non-null only when all four opened.
                        tedEvalGoldTree  = new PrintWriter(tedEvalPrefix + "_gold.ftree");
                        tedEvalGoldSeg   = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
                        tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
                        tedEvalParseSeg  = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
                    }
                    catch (FileNotFoundException e)
                    {
                        // Best effort: report the problem and continue without TEDEval output.
                        System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
                    }
                }
                bool writeTedEval = tedEvalParseSeg != null;

                ICounter <string> labelTotal   = new ClassicCounter <string>();
                ICounter <string> labelCorrect = new ClassicCounter <string>();
                int total   = 0;
                int correct = 0;

                foreach (IList <CoreLabel> goldLine in lines)
                {
                    // Input and gold strings must be rendered before classification.
                    string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(goldLine).ReplaceAll(":", "#pm#")).Split(" ");
                    string[] goldTokens  = TedEvalSanitize(IOBUtils.IOBToString(goldLine, ":")).Split(" ");
                    // A foreach iteration variable cannot be reassigned in C#, so the
                    // classified sequence gets its own local.
                    IList <CoreLabel> line = classifier.Classify(goldLine);
                    string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
                    foreach (CoreLabel label in line)
                    {
                        // Do not evaluate labeling of whitespace
                        string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
                        if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
                        {
                            total++;
                            string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                            string reference  = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                            labelTotal.IncrementCount(reference);
                            if (hypothesis.Equals(reference))
                            {
                                correct++;
                                labelCorrect.IncrementCount(reference);
                            }
                        }
                    }
                    if (writeTedEval)
                    {
                        tedEvalGoldTree.Printf("(root");
                        tedEvalParseTree.Printf("(root");
                        int safeLength = inputTokens.Length;
                        if (inputTokens.Length != goldTokens.Length)
                        {
                            log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                            log.Info("    (ignoring any extras)");
                            log.Info("  input: " + Arrays.ToString(inputTokens));
                            log.Info("  gold: " + Arrays.ToString(goldTokens));
                            safeLength = Math.Min(safeLength, goldTokens.Length);
                        }
                        if (inputTokens.Length != parseTokens.Length)
                        {
                            log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                            log.Info("    (ignoring any extras)");
                            log.Info("  input: " + Arrays.ToString(inputTokens));
                            log.Info("  parse: " + Arrays.ToString(parseTokens));
                            // Narrow the bound already established above instead of replacing it,
                            // so safeLength never exceeds the shortest of the three arrays.
                            safeLength = Math.Min(safeLength, parseTokens.Length);
                        }
                        for (int i = 0; i < safeLength; i++)
                        {
                            foreach (string segment in goldTokens[i].Split(":"))
                            {
                                tedEvalGoldTree.Printf(" (seg %s)", segment);
                            }
                            tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                            foreach (string segment_1 in parseTokens[i].Split(":"))
                            {
                                tedEvalParseTree.Printf(" (seg %s)", segment_1);
                            }
                            tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
                        }
                        tedEvalGoldTree.Printf(")%n");
                        tedEvalGoldSeg.Println();
                        tedEvalParseTree.Printf(")%n");
                        tedEvalParseSeg.Println();
                    }
                }

                // With zero datums this is 0.0 / 0.0, i.e. NaN, which is passed to the report as-is.
                double accuracy = ((double)correct) / ((double)total);

                accuracy *= 100.0;
                pwOut.Println("EVALUATION RESULTS");
                pwOut.Printf("#datums:\t%d%n", total);
                pwOut.Printf("#correct:\t%d%n", correct);
                pwOut.Printf("accuracy:\t%.2f%n", accuracy);
                pwOut.Println("==================");
                // Output the per label accuracies
                pwOut.Println("PER LABEL ACCURACIES");
                foreach (string refLabel in labelTotal.KeySet())
                {
                    double nTotal   = labelTotal.GetCount(refLabel);
                    double nCorrect = labelCorrect.GetCount(refLabel);
                    double acc      = (nCorrect / nTotal) * 100.0;
                    pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
                }
            }
            finally
            {
                // Close whatever was opened, even if only some of the files could be created.
                if (tedEvalGoldTree != null)
                {
                    tedEvalGoldTree.Close();
                }
                if (tedEvalGoldSeg != null)
                {
                    tedEvalGoldSeg.Close();
                }
                if (tedEvalParseTree != null)
                {
                    tedEvalParseTree.Close();
                }
                if (tedEvalParseSeg != null)
                {
                    tedEvalParseSeg.Close();
                }
            }
        }
Esempio n. 5
0
            /// <summary>
            /// Converts one input line into an IOB-labeled sequence of CoreLabels.
            /// </summary>
            /// <remarks>
            /// Depending on the enclosing reader's settings the line may start with a domain
            /// label (split off at the first whitespace run) and may consist of
            /// word/tag tokens, optionally with a raw/rewritten pair in the word part.
            /// </remarks>
            /// <param name="in">The input line.</param>
            /// <returns>The sequence produced by IOBUtils.StringToIOB for this line.</returns>
            public IList <CoreLabel> Apply(string @in)
            {
                bool hasDomainLabels = this._enclosing.inputHasDomainLabels;
                bool hasTags         = this._enclosing.inputHasTags;

                // Work out the domain for this line and strip the label from the text if present.
                string lineDomain;
                if (!hasDomainLabels)
                {
                    lineDomain = this._enclosing.inputDomain;
                }
                else
                {
                    lineDomain = string.Empty;
                    string[] labelAndText = @in.Split("\\s+", 2);
                    if (labelAndText.Length >= 2)
                    {
                        lineDomain = labelAndText[0];
                        @in        = labelAndText[1];
                    }
                    else
                    {
                        ArabicDocumentReaderAndWriter.log.Info("Missing domain label or text: ");
                        ArabicDocumentReaderAndWriter.log.Info(@in);
                    }
                }

                IList <CoreLabel> tokenList;
                if (!hasTags)
                {
                    // Untagged input: build the IOB sequence from the text or from the tokenizer output.
                    if (this._enclosing.tf != null)
                    {
                        IList <CoreLabel> line = this._enclosing.tf.GetTokenizer(new StringReader(@in)).Tokenize();
                        tokenList = IOBUtils.StringToIOB(line, this._enclosing.segMarker, false);
                    }
                    else
                    {
                        tokenList = IOBUtils.StringToIOB(@in, this._enclosing.segMarker);
                    }
                }
                else
                {
                    string[]          pieces      = @in.Split("\\s+");
                    IList <CoreLabel> labels      = new List <CoreLabel>(pieces.Length);
                    string            tagPattern  = Pattern.Quote(ArabicDocumentReaderAndWriter.tagDelimiter);
                    string            rewritePattern = Pattern.Quote(ArabicDocumentReaderAndWriter.rewriteDelimiter);

                    foreach (string piece in pieces)
                    {
                        string[] wordAndTag = piece.Split(tagPattern);
                        System.Diagnostics.Debug.Assert(wordAndTag.Length == 2);
                        string[] rawAndRewrite = wordAndTag[0].Split(rewritePattern);
                        System.Diagnostics.Debug.Assert(rawAndRewrite.Length == 1 || rawAndRewrite.Length == 2);

                        // Without an explicit rewrite the rewritten form is the raw form itself.
                        string raw       = rawAndRewrite[0];
                        string rewritten = rawAndRewrite.Length == 2 ? rawAndRewrite[1] : raw;

                        if (this._enclosing.tf != null)
                        {
                            IList <CoreLabel> rawLex       = this._enclosing.tf.GetTokenizer(new StringReader(raw)).Tokenize();
                            IList <CoreLabel> rewrittenLex = this._enclosing.tf.GetTokenizer(new StringReader(rewritten)).Tokenize();
                            if (rewrittenLex.Count != rawLex.Count)
                            {
                                // Fall back to the raw tokens when the two tokenizations disagree.
                                System.Console.Error.Printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.GetType().FullName, raw, rewritten);
                                rewrittenLex = rawLex;
                            }
                            if (rawLex.IsEmpty())
                            {
                                // Nothing left after tokenization: drop this token entirely.
                                continue;
                            }
                            if (rawLex.Count == 1)
                            {
                                raw       = rawLex[0].Value();
                                rewritten = rewrittenLex[0].Value();
                            }
                            else if (rawLex.Count > 1)
                            {
                                string second = rawLex[1].Value();
                                if (second.Equals(this._enclosing.segMarker.ToString()))
                                {
                                    // Special case for the null marker in the vocalized section
                                    raw       = rawLex[0].Value() + this._enclosing.segMarker;
                                    rewritten = rewrittenLex[0].Value() + this._enclosing.segMarker;
                                }
                                else
                                {
                                    // Report using the original raw text, then keep only the first segment.
                                    System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", this.GetType().FullName, raw);
                                    raw       = rawLex[0].Value();
                                    rewritten = rewrittenLex[0].Value();
                                }
                            }
                        }

                        CoreLabel cl = new CoreLabel();
                        cl.SetValue(raw);
                        cl.SetWord(raw);
                        cl.SetTag(wordAndTag[1]);
                        cl.Set(typeof(CoreAnnotations.DomainAnnotation), lineDomain);
                        cl.Set(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation), rewritten);
                        labels.Add(cl);
                    }
                    tokenList = IOBUtils.StringToIOB(labels, this._enclosing.segMarker, true, this._enclosing.shouldStripRewrites);
                }

                // Tagged input with domain labels already had the domain set on each CoreLabel above,
                // so only the remaining two configurations call LabelDomain here.
                if (!hasDomainLabels)
                {
                    IOBUtils.LabelDomain(tokenList, this._enclosing.inputDomain);
                }
                else if (!hasTags)
                {
                    IOBUtils.LabelDomain(tokenList, lineDomain);
                }
                return tokenList;
            }