Exemplo n.º 1
0
        /// <summary>
        /// Demonstrates round-tripping a ColumnDataClassifier through serialization:
        /// trains a classifier, serializes it to an in-memory byte buffer, deserializes
        /// it back, and prints predictions from both the original and the copy so the
        /// outputs can be compared line by line.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private static void DemonstrateSerializationColumnDataClassifier()
        {
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("Demonstrating working with a serialized classifier using serializeTo");
            ColumnDataClassifier cdc = new ColumnDataClassifier(where + "examples/cheese2007.prop");

            cdc.TrainClassifier(where + "examples/cheeseDisease.train");
            // Exhibit serialization and deserialization working. Serialized to bytes in memory for simplicity
            System.Console.Out.WriteLine();
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            ObjectOutputStream    oos  = new ObjectOutputStream(baos);

            cdc.SerializeClassifier(oos);
            oos.Close();
            byte[] @object            = baos.ToByteArray();
            ByteArrayInputStream bais = new ByteArrayInputStream(@object);
            ObjectInputStream    ois  = new ObjectInputStream(bais);
            ColumnDataClassifier cdc2 = ColumnDataClassifier.GetClassifier(ois);

            ois.Close();
            // We compare the output of the deserialized classifier cdc2 versus the original one cdc
            // For both we use a ColumnDataClassifier to convert text lines to examples
            System.Console.Out.WriteLine("Making predictions with both classifiers");
            foreach (string line in ObjectBank.GetLineIterator(where + "examples/cheeseDisease.test", "utf-8"))
            {
                IDatum <string, string> d  = cdc.MakeDatumFromLine(line);
                IDatum <string, string> d2 = cdc2.MakeDatumFromLine(line);
                System.Console.Out.Printf("%s  =origi=>  %s (%.4f)%n", line, cdc.ClassOf(d), cdc.ScoresOf(d).GetCount(cdc.ClassOf(d)));
                // Bug fix: the "=deser=>" line previously scored the deserialized
                // classifier on d (the original classifier's datum) instead of d2,
                // so cdc2's own featurization was never fully exercised.
                System.Console.Out.Printf("%s  =deser=>  %s (%.4f)%n", line, cdc2.ClassOf(d2), cdc2.ScoresOf(d2).GetCount(cdc2.ClassOf(d2)));
            }
        }
 /// <summary>
 /// Optionally augments the supervised documents with unsupervised data.
 /// When an unsupervised dropout file is configured it is read, every token is
 /// labelled with the background symbol, and the documents are stored in
 /// unsupDocs; when feature discovery is enabled and unsupervised documents
 /// exist, the combined supervised + unsupervised collection is returned.
 /// </summary>
 protected internal override ICollection <IList <IN> > LoadAuxiliaryData(ICollection <IList <IN> > docs, IDocumentReaderAndWriter <IN> readerAndWriter)
 {
     if (flags.unsupDropoutFile != null)
     {
         log.Info("Reading unsupervised dropout data from file: " + flags.unsupDropoutFile);
         Timing readTimer = new Timing();
         readTimer.Start();
         unsupDocs = new List <IList <IN> >();
         ObjectBank <IList <IN> > unsupObjBank = MakeObjectBankFromFile(flags.unsupDropoutFile, readerAndWriter);
         foreach (IList <IN> document in unsupObjBank)
         {
             // Unsupervised tokens carry no labels, so mark both the answer
             // and the gold answer with the background symbol.
             foreach (IN token in document)
             {
                 token.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);
                 token.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), flags.backgroundSymbol);
             }
             unsupDocs.Add(document);
         }
         long elapsedMs = readTimer.Stop();
         log.Info("Time to read: : " + Timing.ToSecondsString(elapsedMs) + " seconds");
     }
     // Without feature discovery (or without any unsupervised documents),
     // the supervised documents are returned unchanged.
     if (unsupDocs == null || !flags.doFeatureDiscovery)
     {
         return(docs);
     }
     IList <IList <IN> > combined = new List <IList <IN> >();
     Sharpen.Collections.AddAll(combined, docs);
     Sharpen.Collections.AddAll(combined, unsupDocs);
     return(combined);
 }
Exemplo n.º 3
0
        /// <summary>
        /// Trains a ColumnDataClassifier on the cheese/disease data, prints a
        /// prediction for every test line, reports accuracy and macro-F1, and then
        /// runs the serialization demos.
        /// </summary>
        /// <param name="args">Optional: args[0] is the directory holding the examples/ folder.</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length > 0)
            {
                where = args[0] + File.separator;
            }
            System.Console.Out.WriteLine("Training ColumnDataClassifier");
            ColumnDataClassifier cdc = new ColumnDataClassifier(where + "examples/cheese2007.prop");
            cdc.TrainClassifier(where + "examples/cheeseDisease.train");

            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("Testing predictions of ColumnDataClassifier");
            foreach (string testLine in ObjectBank.GetLineIterator(where + "examples/cheeseDisease.test", "utf-8"))
            {
                // instead of the method in the line below, if you have the individual elements
                // already you can use cdc.makeDatumFromStrings(String[])
                IDatum <string, string> datum = cdc.MakeDatumFromLine(testLine);
                System.Console.Out.Printf("%s  ==>  %s (%.4f)%n", testLine, cdc.ClassOf(datum), cdc.ScoresOf(datum).GetCount(cdc.ClassOf(datum)));
            }

            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("Testing accuracy of ColumnDataClassifier");
            Pair <double, double> performance = cdc.TestClassifier(where + "examples/cheeseDisease.test");
            System.Console.Out.Printf("Accuracy: %.3f; macro-F1: %.3f%n", performance.First(), performance.Second());

            DemonstrateSerialization();
            DemonstrateSerializationColumnDataClassifier();
        }
        // end class CoNLLIterator
        /// <summary>
        /// Splits the character stream into whole-document strings.  A line whose
        /// start matches docPattern begins a new document; each returned document
        /// keeps its lines joined with '\n'.
        /// </summary>
        private static IEnumerator <string> SplitIntoDocs(Reader r)
        {
            ICollection <string> documents = new List <string>();
            ObjectBank <string> lineBank = ObjectBank.GetLineIterator(r);
            StringBuilder buffer = new StringBuilder();
            Matcher docStartMatcher = docPattern.Matcher(string.Empty);

            foreach (string line in lineBank)
            {
                if (docStartMatcher.Reset(line).LookingAt())
                {
                    // A new document begins here; flush the previous one if non-empty.
                    if (buffer.Length > 0)
                    {
                        documents.Add(buffer.ToString());
                        buffer.Length = 0;
                    }
                }
                buffer.Append(line).Append('\n');
            }
            // Flush the final (trailing) document.
            if (buffer.Length > 0)
            {
                documents.Add(buffer.ToString());
            }
            return(documents.GetEnumerator());
        }
Exemplo n.º 5
0
        /// <summary>This runs a simple train and test regime.</summary>
        /// <remarks>
        /// This runs a simple train and test regime.
        /// The data file format is one item per line, space separated, with first the class label
        /// and then a bunch of (categorical) string features.
        /// </remarks>
        /// <param name="args">The arguments/flags are: -trainFile trainFile -testFile testFile [-l1reg num] [-biased]</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            Properties prop = StringUtils.ArgsToProperties(args);
            // Bug fix: double.ParseDouble does not exist in C# (Java-ism left over from
            // conversion); use double.Parse with the invariant culture so the flag value
            // is read identically regardless of the machine's locale.
            double l1reg = double.Parse(prop.GetProperty("l1reg", "0.0"), System.Globalization.CultureInfo.InvariantCulture);
            Dataset <string, string> ds = new Dataset <string, string>();

            foreach (string line in ObjectBank.GetLineIterator(new File(prop.GetProperty("trainFile"))))
            {
                // Each line is: <label> <feature> <feature> ...
                string[]             bits = line.Split("\\s+");
                ICollection <string> f    = new LinkedList <string>(Arrays.AsList(bits).SubList(1, bits.Length));
                string l = bits[0];
                ds.Add(f, l);
            }
            ds.SummaryStatistics();
            bool biased = prop.GetProperty("biased", "false").Equals("true");
            LogisticClassifierFactory <string, string> factory = new LogisticClassifierFactory <string, string>();

            Edu.Stanford.Nlp.Classify.LogisticClassifier <string, string> lc = factory.TrainClassifier(ds, l1reg, 1e-4, biased);
            foreach (string line_1 in ObjectBank.GetLineIterator(new File(prop.GetProperty("testFile"))))
            {
                string[]             bits = line_1.Split("\\s+");
                ICollection <string> f    = new LinkedList <string>(Arrays.AsList(bits).SubList(1, bits.Length));
                //String l = bits[0];
                string g    = lc.ClassOf(f);
                double prob = lc.ProbabilityOf(f, g);
                System.Console.Out.Printf("%4.3f\t%s\t%s%n", prob, g, line_1);
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Loads a two-column, whitespace-separated mapping file.
        /// </summary>
        /// <param name="mapFile">Path (or resource name) of the mapping file.</param>
        /// <returns>A map from the first column to the second column of each line.</returns>
        private static IDictionary <string, string> LoadMixedCaseMap(string mapFile)
        {
            IDictionary <string, string> mixedCaseMap = Generics.NewHashMap();

            try
            {
                using (BufferedReader reader = IOUtils.ReaderFromString(mapFile))
                {
                    foreach (string rawLine in ObjectBank.GetLineIterator(reader))
                    {
                        string trimmed = rawLine.Trim();
                        string[] fields = trimmed.Split("\\s+");
                        // Every line must consist of exactly a key and a value.
                        if (fields.Length != 2)
                        {
                            throw new Exception("Wrong format: " + mapFile);
                        }
                        mixedCaseMap[fields[0]] = fields[1];
                    }
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            return(mixedCaseMap);
        }
Exemplo n.º 7
0
 /// <param name="normalizationTableFile">
 /// A file listing character pairs for
 /// normalization.  Currently the normalization table must be in UTF-8.
 /// If this parameter is
 /// <see langword="null"/>
 /// , the default normalization
 /// of the zero-argument constructor is used.
 /// </param>
 public ChineseDocumentToSentenceProcessor(string normalizationTableFile)
 {
     // todo: This class is a mess. We should try to get it out of core
     // not \uff0e . (too often separates English first/last name, etc.)
     // private final String normalizationTableFile;
     // this.normalizationTableFile = normalizationTableFile;
     if (normalizationTableFile == null)
     {
         // No table file: fall back to the default normalization.
         normalizationTable = null;
         return;
     }
     normalizationTable = new List <Pair <string, string> >();
     foreach (string tableLine in ObjectBank.GetLineIterator(new File(normalizationTableFile), encoding))
     {
         // Each line is expected to contain one source/target character pair.
         Matcher pairMatcher = PairPattern.Matcher(tableLine);
         if (!pairMatcher.Find())
         {
             log.Info("Didn't match: " + tableLine);
             continue;
         }
         normalizationTable.Add(new Pair <string, string>(pairMatcher.Group(1), pairMatcher.Group(2)));
     }
 }
 /// <summary>
 /// Creates a wrapper around <paramref name="wrapped"/>, storing the flags and
 /// the known lower-case word set for later use by the wrapper.
 /// </summary>
 public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank <IList <In> > wrapped, ICollection <string> knownLCWords)
     : base(null, null)
 {
     // Plain field assignments; order is immaterial.
     this.knownLCWords = knownLCWords;
     this.wrapped      = wrapped;
     this.flags        = flags;
 }
Exemplo n.º 9
0
        // static demo class
        /// <summary>
        /// Demo: compiles each line of a rules file as a TokenSequencePattern and,
        /// for every sentence of the annotated input file, prints the tokens and all
        /// non-overlapping pattern matches with their capture groups.
        /// Usage: TokensRegexMatcher rules file [outFile]
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
                return;
            }
            string      rules = args[0];
            PrintWriter @out;

            if (args.Length > 2)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            StanfordCoreNLP pipeline   = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation      annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));

            pipeline.Annotate(annotation);
            // Load lines of file as TokenSequencePatterns
            IList <TokenSequencePattern> tokenSequencePatterns = new List <TokenSequencePattern>();

            foreach (string line in ObjectBank.GetLineIterator(rules))
            {
                TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
                tokenSequencePatterns.Add(pattern);
            }
            // Perf fix: the multi-pattern matcher does not depend on the sentence, so
            // build it once here instead of rebuilding it inside the sentence loop.
            MultiPatternMatcher <ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int i = 0;

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                @out.Println("Sentence #" + ++i);
                @out.Print("  Tokens:");
                foreach (CoreLabel token in tokens)
                {
                    @out.Print(' ');
                    @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                @out.Println();
                IList <ISequenceMatchResult <ICoreMap> > answers = multiMatcher.FindNonOverlapping(tokens);
                int j = 0;
                foreach (ISequenceMatchResult <ICoreMap> matched in answers)
                {
                    @out.Println("  Match #" + ++j);
                    // Group 0 is the whole match; higher indices are capture groups.
                    for (int k = 0; k <= matched.GroupCount(); k++)
                    {
                        @out.Println("    group " + k + " = " + matched.Group(k));
                    }
                }
            }
            @out.Flush();
        }
Exemplo n.º 10
0
 /// <summary>
 /// Unity lifecycle hook: caches component/scene references, creates the
 /// container GameObjects for spawned props, and starts the first game.
 /// </summary>
 void Start()
 {
     // Container objects named for the prop categories they will hold.
     trees  = new GameObject("Trees");
     arbors = new GameObject("Arbors");
     banks  = new GameObject("Banks");
     // Look up collaborators once at startup rather than per frame.
     generator  = GetComponent <MazeConstructor>();
     objectBank = FindObjectOfType <ObjectBank>();
     StartNewGame();
 }
Exemplo n.º 11
0
        /// <summary>Train a segmenter from raw text.</summary>
        /// <remarks>Train a segmenter from raw text. Gold segmentation markers are required.</remarks>
        public virtual void Train()
        {
            // The training data is read with segmentation markers and tags enabled.
            const bool hasSegmentationMarkers = true;
            const bool hasTags = true;
            IDocumentReaderAndWriter <CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, noRewrites, tf);
            ObjectBank <IList <CoreLabel> > trainLines = classifier.MakeObjectBankFromFile(flags.trainFile, docReader);

            classifier.Train(trainLines, docReader);
            log.Info("Finished training.");
        }
        /// <summary>
        /// The Extractor argument extraction keeps ; together, so we use
        /// that to delimit options.
        /// </summary>
        /// <remarks>
        /// The Extractor argument extraction keeps ; together, so we use
        /// that to delimit options.  Actually, the only option supported is
        /// mapdigits, which tells the Distsim to try mapping [0-9] to 0 and
        /// requery for an unknown word with digits.
        /// </remarks>
        public Distsim(string path)
        {
            // Avoid loading the same lexicon twice but allow different lexicons
            // TODO: when loading a distsim, should we populate this map?
            // = false
            // = false;
            string[] pieces   = path.Split(";");
            string   filename = pieces[0];

            // Any remaining ;-separated pieces are option flags.
            for (int arg = 1; arg < pieces.Length; ++arg)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(pieces[arg], "mapdigits"))
                {
                    mapdigits = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(pieces[arg], "casedDistSim"))
                {
                    casedDistSim = true;
                }
                else
                {
                    throw new ArgumentException("Unknown argument " + pieces[arg]);
                }
            }
            lexicon = Generics.NewHashMap();
            // todo [cdm 2016]: Note that this loads file with default file encoding rather than specifying it
            foreach (string entry in ObjectBank.GetLineIterator(new File(filename)))
            {
                // Each lexicon line is: <word> <class>, whitespace separated.
                string[] columns = entry.Split("\\s+");
                string   key     = columns[0];
                if (!casedDistSim)
                {
                    key = key.ToLower();
                }
                lexicon[key] = columns[1];
            }
            // Fall back to the literal string "null" when no <unk> class is supplied.
            unk = lexicon.Contains("<unk>") ? lexicon["<unk>"] : "null";
        }
Exemplo n.º 13
0
 /// <summary>
 /// Reads an SRL file into srlMap: file name -> (tree number -> SRL info strings).
 /// Each input line has the form "filename treeNum info"; only the first two
 /// whitespace fields are split off, so info may itself contain whitespace.
 /// </summary>
 private void ReadSRLFile(string srlFile)
 {
     srlMap = Generics.NewHashMap();
     foreach (string line in ObjectBank.GetLineIterator(new File(srlFile)))
     {
         // Split into at most 3 parts: file name, tree number, rest of line.
         string[] bits     = line.Split("\\s+", 3);
         string   filename = bits[0];
         int      treeNum  = System.Convert.ToInt32(bits[1]);
         string   info     = bits[2];
         // NOTE(review): this relies on the indexer returning null for a missing
         // key (Java/Sharpen map semantics) rather than throwing — confirm the
         // behavior of the map type produced by Generics.NewHashMap.
         CollectionValuedMap <int, string> cvm = srlMap[filename];
         if (cvm == null)
         {
             // First entry for this file: create its tree-number -> info multimap.
             cvm = new CollectionValuedMap <int, string>();
             srlMap[filename] = cvm;
         }
         cvm.Add(treeNum, info);
     }
 }
Exemplo n.º 14
0
 /// <summary>Read the data as a list of RVFDatum objects.</summary>
 /// <remarks>Read the data as a list of RVFDatum objects. For the test set we must reuse the indices from the training set</remarks>
 internal static List <RVFDatum <string, int> > ReadData(string filename, IDictionary <int, IIndex <string> > indices)
 {
     // Field separator used by ReadDatum for each line.
     const string sep = ", ";
     try
     {
         List <RVFDatum <string, int> > data = new List <RVFDatum <string, int> >();
         foreach (string row in ObjectBank.GetLineIterator(new File(filename)))
         {
             data.Add(ReadDatum(row, sep, indices));
         }
         return(data);
     }
     catch (Exception e)
     {
         // Best-effort: print the failure and signal it with a null result.
         Sharpen.Runtime.PrintStackTrace(e);
     }
     return(null);
 }
        /// <summary>
        /// Checks that ObjectBankWrapper tokenizes the embedded document string into
        /// the expected words and assigns the expected "chris2" word shapes.
        /// </summary>
        public virtual void TestUsingIterator()
        {
            string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";

            string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
            string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
            NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
            Properties         props = PropertiesUtils.AsProperties("wordShape", "chris2");
            SeqClassifierFlags flags = new SeqClassifierFlags(props);
            PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();

            readerAndWriter.Init(flags);
            ReaderIteratorFactory           rif          = new ReaderIteratorFactory(new StringReader(s));
            ObjectBank <IList <CoreLabel> > di           = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
            ICollection <string>            knownLCWords = new HashSet <string>();
            ObjectBankWrapper <CoreLabel>   obw          = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

            try
            {
                int outIdx = 0;
                // Walk every token of every sentence, comparing word and shape
                // against the parallel expectation arrays.
                foreach (IList <CoreLabel> sentence in obw)
                {
                    foreach (CoreLabel label in sentence)
                    {
                        NUnit.Framework.Assert.AreEqual(output[outIdx], label.Word());
                        NUnit.Framework.Assert.AreEqual(outWSs[outIdx], label.Get(typeof(CoreAnnotations.ShapeAnnotation)));
                        outIdx++;
                    }
                }
                if (outIdx < output.Length)
                {
                    NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
                }
            }
            catch (Exception e)
            {
                NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
            }
        }
        /// <summary>
        /// Loads a distributional-similarity lexicon from <paramref name="filename"/>,
        /// mapping each word to its class string.  Two formats are supported:
        /// "terryKoo" (class TAB word ...) and the default "alexClark"
        /// (word class, whitespace separated).
        /// </summary>
        public DistSimClassifier(string filename, string format, string encoding, int distSimMaxBits, bool cased, bool numberEquivalence, string unknownWordClass)
        {
            this.cased             = cased;
            this.numberEquivalence = numberEquivalence;
            this.unknownWordClass  = unknownWordClass;
            Timing.StartDoing("Loading distsim lexicon from " + filename);
            lexicon = Generics.NewHashMap(1 << 15);
            // make a reasonable starting size
            bool terryKoo = "terryKoo".Equals(format);

            foreach (string line in ObjectBank.GetLineIterator(filename, encoding))
            {
                string word;
                string wordClass;
                if (terryKoo)
                {
                    // terryKoo format: <class>\t<word>...
                    string[] fields = line.Split("\\t");
                    word      = fields[1];
                    wordClass = fields[0];
                    // Optionally keep only the first distSimMaxBits characters of the class.
                    if (distSimMaxBits > 0 && wordClass.Length > distSimMaxBits)
                    {
                        wordClass = Sharpen.Runtime.Substring(wordClass, 0, distSimMaxBits);
                    }
                }
                else
                {
                    // "alexClark" format: <word> <class>
                    string[] fields = line.Split("\\s+");
                    word      = fields[0];
                    wordClass = fields[1];
                }
                if (!cased)
                {
                    word = word.ToLower();
                }
                if (numberEquivalence)
                {
                    // Map digit characters to a common shape before storing.
                    word = WordShapeClassifier.WordShape(word, WordShapeClassifier.Wordshapedigits);
                }
                lexicon[word] = wordClass;
            }
            Timing.EndDoing();
        }
 /// <summary>Constructs a Dataset by reading in a file in SVM light format.</summary>
 /// <remarks>
 /// Constructs a Dataset by reading in a file in SVM light format.
 /// the created dataset has the same feature and label index as given
 /// </remarks>
 /// <param name="filename">Path of the SVM light formatted file to read.</param>
 /// <param name="featureIndex">Feature index to reuse (e.g. from the training set).</param>
 /// <param name="labelIndex">Label index to reuse.</param>
 /// <param name="lines">If non-null, every raw input line is also appended to this list.</param>
 public static Edu.Stanford.Nlp.Classify.Dataset <string, string> ReadSVMLightFormat(string filename, IIndex <string> featureIndex, IIndex <string> labelIndex, IList <string> lines)
 {
     Edu.Stanford.Nlp.Classify.Dataset <string, string> dataset;
     try
     {
         dataset = new Edu.Stanford.Nlp.Classify.Dataset <string, string>(10, featureIndex, labelIndex);
         foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
         {
             if (lines != null)
             {
                 lines.Add(line);
             }
             dataset.Add(SvmLightLineToDatum(line));
         }
     }
     catch (Exception e)
     {
         // Bug fix: System.Exception has no (Exception) constructor, so
         // "new Exception(e)" would not compile; wrap the cause as the inner
         // exception instead (mirrors Java's RuntimeException(e)).
         throw new Exception(e.Message, e);
     }
     return(dataset);
 }
Exemplo n.º 18
0
 /// <summary>reads scores with classes from a file, sorts by score and creates the arrays</summary>
 public PRCurve(string filename)
 {
     //sorted scores
     // the class of example i
     // the guess of example i according to the argmax
     // number positive in the i-th highest scores
     // number negative in the i-th lowest scores
     try
     {
         List <Pair <double, int> > dataScores = new List <Pair <double, int> >();
         foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
         {
             // Each line holds "<score> <class>".
             IList <string> elems = StringUtils.Split(line);
             // Bug fix: double.ValueOf does not exist in C# (Java-ism); parse with
             // double.Parse using the invariant culture for locale-stable reads.
             double score = double.Parse(elems[0], System.Globalization.CultureInfo.InvariantCulture);
             Pair <double, int> p = new Pair <double, int>(score, int.Parse(elems[1]));
             dataScores.Add(p);
         }
         Init(dataScores);
     }
     catch (Exception e)
     {
         // Best-effort: print the failure; the curve simply stays uninitialized.
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Exemplo n.º 19
0
 /// <summary>reads scores with classes from a file, sorts by score and creates the arrays</summary>
 public PRCurve(string filename, bool svm)
 {
     try
     {
         List <Pair <double, int> > dataScores = new List <Pair <double, int> >();
         foreach (string line in ObjectBank.GetLineIterator(new File(filename)))
         {
             // Each line holds "<class> <score>" (SVM output).
             IList <string> elems = StringUtils.Split(line);
             // Bug fix: the original used double.ValueOf (nonexistent in C#),
             // assigned the double to an int, and then called int.Parse on that
             // int — none of which compiles. Parse once and truncate to int.
             int cls = (int)double.Parse(elems[0], System.Globalization.CultureInfo.InvariantCulture);
             // SVMs label negatives as -1; normalize to 0.
             if (cls == -1)
             {
                 cls = 0;
             }
             // Offset the raw score by 0.5, as in the original implementation.
             double score = double.Parse(elems[1], System.Globalization.CultureInfo.InvariantCulture) + 0.5;
             dataScores.Add(new Pair <double, int>(score, cls));
         }
         Init(dataScores);
     }
     catch (Exception e)
     {
         // Best-effort: print the failure; the curve simply stays uninitialized.
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Exemplo n.º 20
0
        /// <summary>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses.
        /// </summary>
        /// <remarks>
        /// Evaluate accuracy when the input is gold segmented text *with* segmentation
        /// markers and morphological analyses. In other words, the evaluation file has the
        /// same format as the training data.
        /// </remarks>
        /// <param name="pwOut">Writer that receives the overall and per-label accuracy report.</param>
        private void Evaluate(PrintWriter pwOut)
        {
            log.Info("Starting evaluation...");
            bool hasSegmentationMarkers = true;
            bool hasTags = true;
            IDocumentReaderAndWriter <CoreLabel> docReader = new ArabicDocumentReaderAndWriter(hasSegmentationMarkers, hasTags, hasDomainLabels, domain, tf);
            ObjectBank <IList <CoreLabel> >      lines     = classifier.MakeObjectBankFromFile(flags.testFile, docReader);
            // Optional TEDEval output writers; they stay null unless tedEvalPrefix is
            // set and opening the files succeeds.
            PrintWriter tedEvalGoldTree  = null;
            PrintWriter tedEvalParseTree = null;
            PrintWriter tedEvalGoldSeg   = null;
            PrintWriter tedEvalParseSeg  = null;

            if (tedEvalPrefix != null)
            {
                try
                {
                    tedEvalGoldTree  = new PrintWriter(tedEvalPrefix + "_gold.ftree");
                    tedEvalGoldSeg   = new PrintWriter(tedEvalPrefix + "_gold.segmentation");
                    tedEvalParseTree = new PrintWriter(tedEvalPrefix + "_parse.ftree");
                    tedEvalParseSeg  = new PrintWriter(tedEvalPrefix + "_parse.segmentation");
                }
                catch (FileNotFoundException e)
                {
                    // Evaluation proceeds without TEDEval output if a file cannot be created.
                    System.Console.Error.Printf("%s: %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, e.Message);
                }
            }
            // Running totals, overall and per reference label.
            ICounter <string> labelTotal   = new ClassicCounter <string>();
            ICounter <string> labelCorrect = new ClassicCounter <string>();
            int total   = 0;
            int correct = 0;

            foreach (IList <CoreLabel> line in lines)
            {
                // Plain input rendering replaces ":" with "#pm#" because ":" is used
                // below as the segment delimiter in the gold/parse renderings.
                string[] inputTokens = TedEvalSanitize(IOBUtils.IOBToString(line).ReplaceAll(":", "#pm#")).Split(" ");
                string[] goldTokens  = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
                // Classify, then render the hypothesis segmentation the same way.
                line = classifier.Classify(line);
                string[] parseTokens = TedEvalSanitize(IOBUtils.IOBToString(line, ":")).Split(" ");
                foreach (CoreLabel label in line)
                {
                    // Do not evaluate labeling of whitespace
                    string observation = label.Get(typeof(CoreAnnotations.CharAnnotation));
                    if (!observation.Equals(IOBUtils.GetBoundaryCharacter()))
                    {
                        total++;
                        string hypothesis = label.Get(typeof(CoreAnnotations.AnswerAnnotation));
                        string reference  = label.Get(typeof(CoreAnnotations.GoldAnswerAnnotation));
                        labelTotal.IncrementCount(reference);
                        if (hypothesis.Equals(reference))
                        {
                            correct++;
                            labelCorrect.IncrementCount(reference);
                        }
                    }
                }
                // Write this sentence to the TEDEval files.  tedEvalParseSeg is assigned
                // last above, so if it is non-null all four writers are non-null.
                if (tedEvalParseSeg != null)
                {
                    tedEvalGoldTree.Printf("(root");
                    tedEvalParseTree.Printf("(root");
                    // If the renderings disagree on token count, only emit up to the
                    // shortest length and log the discrepancy.
                    int safeLength = inputTokens.Length;
                    if (inputTokens.Length != goldTokens.Length)
                    {
                        log.Info("In generating TEDEval files: Input and gold do not have the same number of tokens");
                        log.Info("    (ignoring any extras)");
                        log.Info("  input: " + Arrays.ToString(inputTokens));
                        log.Info("  gold: " + Arrays.ToString(goldTokens));
                        safeLength = Math.Min(inputTokens.Length, goldTokens.Length);
                    }
                    if (inputTokens.Length != parseTokens.Length)
                    {
                        log.Info("In generating TEDEval files: Input and parse do not have the same number of tokens");
                        log.Info("    (ignoring any extras)");
                        log.Info("  input: " + Arrays.ToString(inputTokens));
                        log.Info("  parse: " + Arrays.ToString(parseTokens));
                        safeLength = Math.Min(inputTokens.Length, parseTokens.Length);
                    }
                    for (int i = 0; i < safeLength; i++)
                    {
                        // Tokens are ":"-joined segments; emit one (seg ...) per segment.
                        foreach (string segment in goldTokens[i].Split(":"))
                        {
                            tedEvalGoldTree.Printf(" (seg %s)", segment);
                        }
                        tedEvalGoldSeg.Printf("%s\t%s%n", inputTokens[i], goldTokens[i]);
                        foreach (string segment_1 in parseTokens[i].Split(":"))
                        {
                            tedEvalParseTree.Printf(" (seg %s)", segment_1);
                        }
                        tedEvalParseSeg.Printf("%s\t%s%n", inputTokens[i], parseTokens[i]);
                    }
                    tedEvalGoldTree.Printf(")%n");
                    tedEvalGoldSeg.Println();
                    tedEvalParseTree.Printf(")%n");
                    tedEvalParseSeg.Println();
                }
            }
            // Overall accuracy as a percentage of non-boundary characters.
            double accuracy = ((double)correct) / ((double)total);

            accuracy *= 100.0;
            pwOut.Println("EVALUATION RESULTS");
            pwOut.Printf("#datums:\t%d%n", total);
            pwOut.Printf("#correct:\t%d%n", correct);
            pwOut.Printf("accuracy:\t%.2f%n", accuracy);
            pwOut.Println("==================");
            // Output the per label accuracies
            pwOut.Println("PER LABEL ACCURACIES");
            foreach (string refLabel in labelTotal.KeySet())
            {
                double nTotal   = labelTotal.GetCount(refLabel);
                double nCorrect = labelCorrect.GetCount(refLabel);
                double acc      = (nCorrect / nTotal) * 100.0;
                pwOut.Printf(" %s\t%.2f%n", refLabel, acc);
            }
            if (tedEvalParseSeg != null)
            {
                tedEvalGoldTree.Close();
                tedEvalGoldSeg.Close();
                tedEvalParseTree.Close();
                tedEvalParseSeg.Close();
            }
        }
Exemplo n.º 21
0
            /// <summary>
            /// Parses one CoNLL-U sentence (given as a multi-line string) into a
            /// <see cref="SemanticGraph"/> of typed dependencies. Returns null for a null input line.
            /// </summary>
            /// <param name="line">Raw CoNLL-U text for a single sentence (one token per line), or null.</param>
            /// <returns>The dependency graph built from the sentence, or null if <paramref name="line"/> is null.</returns>
            public virtual SemanticGraph Apply(string line)
            {
                if (line == null)
                {
                    return(null);
                }
                // Tokenize the raw CoNLL-U lines into IndexedWord objects via the reader's word processor.
                IFunction <string, IndexedWord> func     = new CoNLLUDocumentReader.WordProcessor();
                ObjectBank <IndexedWord>        words    = ObjectBank.GetLineIterator(new StringReader(line), func);
                IList <IndexedWord>             wordList = new List <IndexedWord>(words);
                IList <IndexedWord>             sorted   = new List <IndexedWord>(wordList.Count);
                IList <string> comments = new LinkedList <string>();

                /* Increase the line number in case there are comments before the actual sentence
                 * and add them to the list of comments. */
                // NOTE(review): the Filter(null)/Sorted(...)/ForEach(null) calls below are artifacts of an
                // automatic Java->C# translation — the original lambda bodies were lost and replaced with
                // null, so as written these would throw at runtime. Likewise `sorted` and `sortedTokens`
                // are never visibly populated here. Confirm against the original Java CoNLLUDocumentReader.
                wordList.Stream().Filter(null).ForEach(null);
                wordList.Stream().Filter(null).Sorted(byIndex.ThenComparing(byType)).ForEach(null);
                IList <IndexedWord> sortedTokens = new List <IndexedWord>(wordList.Count);

                sorted.Stream().Filter(null).Filter(null).ForEach(null);
                sorted.Stream().Filter(null).Filter(null).ForEach(null);
                /* Construct a semantic graph. */
                IList <TypedDependency> deps = new List <TypedDependency>(sorted.Count);
                // Span and surface form of the current multiword token (e.g. "1-2  du"), carried
                // forward so the covered single tokens can be annotated with it.
                IntPair tokenSpan            = null;
                string  originalToken        = null;

                foreach (IndexedWord word in sorted)
                {
                    // Tracks the CoNLL-U file line for this token (field on the enclosing class).
                    lineNumberCounter++;
                    if (word.ContainsKey(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation)))
                    {
                        // A multiword-token range line: remember its span and text, but create no dependency.
                        tokenSpan     = word.Get(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation));
                        originalToken = word.Word();
                    }
                    else
                    {
                        /* Deal with multiword tokens. */
                        if (tokenSpan != null && tokenSpan.GetTarget() >= word.Index())
                        {
                            // This token is covered by the pending multiword span: propagate the
                            // original surface text and the span annotation onto it.
                            word.SetOriginalText(originalToken);
                            word.Set(typeof(CoreAnnotations.CoNLLUTokenSpanAnnotation), tokenSpan);
                        }
                        else
                        {
                            // Past the end of the span (or none pending) — reset the carry state.
                            tokenSpan     = null;
                            originalToken = null;
                        }
                        Dictionary <string, string> extraDeps = word.Get(typeof(CoreAnnotations.CoNLLUSecondaryDepsAnnotation));
                        if (extraDeps.IsEmpty())
                        {
                            // No enhanced/secondary deps: build the single primary dependency from
                            // the HEAD and DEPREL columns.
                            int govIdx = word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation));
                            Pair <IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(govIdx, 0, word, word.Get(typeof(CoreAnnotations.CoNLLDepTypeAnnotation)), sortedTokens);
                            IndexedWord         gov  = govReln.First();
                            GrammaticalRelation reln = govReln.Second();
                            TypedDependency     dep  = new TypedDependency(reln, gov, word);
                            word.Set(typeof(CoreAnnotations.LineNumberAnnotation), lineNumberCounter);
                            deps.Add(dep);
                        }
                        else
                        {
                            // Enhanced dependencies (DEPS column): one TypedDependency per governor entry.
                            foreach (string extraGovIdxStr in extraDeps.Keys)
                            {
                                if (extraGovIdxStr.Contains("."))
                                {
                                    // Governor index of the form "N.C" refers to copy node C of token N
                                    // (empty-node convention); always an extra (non-primary) dependency.
                                    string[] indexParts  = extraGovIdxStr.Split("\\.");
                                    int      extraGovIdx = System.Convert.ToInt32(indexParts[0]);
                                    int      copyCount   = System.Convert.ToInt32(indexParts[1]);
                                    Pair <IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(extraGovIdx, copyCount, word, extraDeps[extraGovIdxStr], sortedTokens);
                                    IndexedWord         gov  = govReln.First();
                                    GrammaticalRelation reln = govReln.Second();
                                    TypedDependency     dep  = new TypedDependency(reln, gov, word);
                                    dep.SetExtra();
                                    deps.Add(dep);
                                }
                                else
                                {
                                    // Plain integer governor index; -1 when the primary HEAD is absent.
                                    int extraGovIdx = System.Convert.ToInt32(extraGovIdxStr);
                                    int mainGovIdx  = word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation)) != null?word.Get(typeof(CoreAnnotations.CoNLLDepParentIndexAnnotation)) : -1;

                                    Pair <IndexedWord, GrammaticalRelation> govReln = GetGovAndReln(extraGovIdx, 0, word, extraDeps[extraGovIdxStr], sortedTokens);
                                    IndexedWord         gov  = govReln.First();
                                    GrammaticalRelation reln = govReln.Second();
                                    TypedDependency     dep  = new TypedDependency(reln, gov, word);
                                    // Only mark as extra when it duplicates-with-a-different-governor,
                                    // i.e. it is not the same edge as the primary dependency.
                                    if (extraGovIdx != mainGovIdx)
                                    {
                                        dep.SetExtra();
                                    }
                                    deps.Add(dep);
                                }
                            }
                        }
                    }
                }
                // Account for the blank line terminating the sentence in the line counter.
                lineNumberCounter++;
                SemanticGraph sg = new SemanticGraph(deps);

                // NOTE(review): another lost lambda from the translation — presumably attaches the
                // collected comment lines to the graph; confirm against the original Java source.
                comments.ForEach(null);
                return(sg);
            }