コード例 #1
0
        /// <summary>
        /// Rewrites every file in <c>pathsToData</c> into the single UTF-8 file <c>outFileName</c>,
        /// one whitespace-tokenized sentence per output line.
        /// </summary>
        /// <remarks>
        /// Tokens matched by <c>utf8ArabicChart</c> are passed through <c>escaper</c> and then
        /// <c>lexMapper</c>; all other tokens are written unchanged. A per-file line count is appended
        /// to <c>toStringBuffer</c>. Failures are reported on standard error and are not rethrown.
        /// </remarks>
        public virtual void Build()
        {
            LineNumberReader infile        = null;
            PrintWriter      outfile       = null;
            string           currentInfile = string.Empty;

            try
            {
                outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
                foreach (File path in pathsToData)
                {
                    infile        = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8")));
                    currentInfile = path.GetPath();
                    while (infile.Ready())
                    {
                        List <Word> sent = SentenceUtils.ToUntaggedList(infile.ReadLine().Split("\\s+"));
                        foreach (Word token in sent)
                        {
                            Matcher hasArabic = utf8ArabicChart.Matcher(token.Word());
                            if (hasArabic.Find())
                            {
                                token.SetWord(escaper.Apply(token.Word()));
                                token.SetWord(lexMapper.Map(null, token.Word()));
                            }
                        }
                        outfile.Println(SentenceUtils.ListToString(sent));
                    }
                    toStringBuffer.Append(string.Format(" Read %d input lines from %s", infile.GetLineNumber(), path.GetPath()));

                    // Close every reader as soon as its file is finished; previously only the
                    // last one was closed, and an empty pathsToData dereferenced a null reader.
                    infile.Close();
                    infile = null;
                }
            }
            catch (UnsupportedEncodingException e)
            {
                System.Console.Error.Printf("%s: Filesystem does not support UTF-8 output\n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException)
            {
                System.Console.Error.Printf("%s: Could not open %s for writing\n", this.GetType().FullName, outFileName);
            }
            catch (IOException)
            {
                // infile is null when the failure happened before any input was opened.
                int lineNumber = (infile != null) ? infile.GetLineNumber() : 0;
                System.Console.Error.Printf("%s: Error reading from %s (line %d)\n", this.GetType().FullName, currentInfile, lineNumber);
            }
            catch (Exception e)
            {
                int lineNumber = (infile != null) ? infile.GetLineNumber() : 0;
                System.Console.Error.Printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", this.GetType().FullName, currentInfile, lineNumber);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                // A non-null reader here means processing was interrupted mid-file.
                if (infile != null)
                {
                    try
                    {
                        infile.Close();
                    }
                    catch (IOException)
                    {
                        // Best-effort cleanup: the primary failure has already been reported above.
                    }
                }
                if (outfile != null)
                {
                    outfile.Close();
                }
            }
        }
コード例 #2
0
        /// <summary>Tags a sentence with the maxent model.</summary>
        /// <remarks>
        /// The words of <paramref name="s"/> (mapped through <c>maxentTagger.wordFunction</c> when one
        /// is set) are copied into <c>sent</c>, followed by the end-of-sentence word, and inference is
        /// then run. If a word function was applied, the original words are restored on the result.
        /// </remarks>
        /// <param name="s">Input sentence (List).  This isn't changed.</param>
        /// <param name="reuseTags">
        /// If true, tags carried by the input items that implement <c>IHasTag</c> are recorded in
        /// <c>originalTags</c>; items without a tag contribute a null entry.
        /// </param>
        /// <returns>Tagged sentence (a List of TaggedWord objects)</returns>
        public virtual List <TaggedWord> TagSentence <_T0>(IList <_T0> s, bool reuseTags)
            where _T0 : IHasWord
        {
            int wordCount = s.Count;

            this.origWords = new List<IHasWord>(s);

            // Words to tag, plus one slot for the end-of-sentence marker.
            this.sent = new List<string>(wordCount + 1);
            foreach (IHasWord item in s)
            {
                string word = item.Word();
                sent.Add(maxentTagger.wordFunction != null ? maxentTagger.wordFunction.Apply(word) : word);
            }
            sent.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);

            if (reuseTags)
            {
                this.originalTags = new List<string>(wordCount + 1);
                foreach (IHasWord item in s)
                {
                    IHasTag tagged = item as IHasTag;
                    originalTags.Add(tagged != null ? tagged.Tag() : null);
                }
                originalTags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
            }

            size = wordCount + 1;
            if (Verbose)
            {
                log.Info("Sentence is " + SentenceUtils.ListToString(sent, false, tagSeparator));
            }
            Init();
            List<TaggedWord> tagged_ = TestTagInference();

            // Undo the word function on the output so callers see their own words.
            if (maxentTagger.wordFunction != null)
            {
                int index = 0;
                while (index < wordCount)
                {
                    tagged_[index].SetWord(s[index].Word());
                    ++index;
                }
            }
            return tagged_;
        }
コード例 #3
0
        /// <summary>
        /// Reads <paramref name="filename"/> in the requested input format and returns one
        /// <c>Annotation</c> per sentence (text input) or per tree (tree input).
        /// </summary>
        /// <param name="tokenizer">Pipeline used to annotate raw text; only used for text input.</param>
        /// <param name="inputFormat">How the file contents are to be interpreted.</param>
        /// <param name="filename">File to read.</param>
        /// <param name="filterUnknown">
        /// For tree input: read trees with gold labels and pass them through
        /// <c>SentimentUtils.FilterUnknownRoots</c> instead of loading a plain treebank.
        /// </param>
        /// <exception cref="System.ArgumentException">The input format is not recognized.</exception>
        public static IList <Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
        {
            if (inputFormat == SentimentPipeline.Input.Text)
            {
                Annotation document = new Annotation(IOUtils.SlurpFileNoExceptions(filename));
                tokenizer.Annotate(document);

                // Wrap every sentence of the document in its own single-sentence annotation.
                IList<Annotation> perSentence = Generics.NewArrayList();
                foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    string     sentenceText = sentence.Get(typeof(CoreAnnotations.TextAnnotation));
                    Annotation wrapper      = new Annotation(sentenceText);
                    wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
                    perSentence.Add(wrapper);
                }
                return perSentence;
            }

            if (inputFormat == SentimentPipeline.Input.Trees)
            {
                IList<Tree> trees;
                if (!filterUnknown)
                {
                    MemoryTreebank treebank = new MemoryTreebank("utf-8");
                    treebank.LoadPath(filename, null);
                    trees = new List<Tree>(treebank);
                }
                else
                {
                    trees = SentimentUtils.FilterUnknownRoots(SentimentUtils.ReadTreesWithGoldLabels(filename));
                }

                IList<Annotation> perTree = Generics.NewArrayList();
                foreach (Tree tree in trees)
                {
                    ICoreMap sentence = new Annotation(SentenceUtils.ListToString(tree.Yield()));
                    sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);

                    IList<ICoreMap> single  = Java.Util.Collections.SingletonList(sentence);
                    Annotation      wrapper = new Annotation(string.Empty);
                    wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), single);
                    perTree.Add(wrapper);
                }
                return perTree;
            }

            throw new ArgumentException("Unknown format " + inputFormat);
        }
コード例 #4
0
        /// <summary>Renders a parse tree as a string of word/tag tokens.</summary>
        /// <remarks>
        /// The tree is pruned with <c>emptyFilter</c> first. Each token of the tagged yield then has
        /// both its word and its value set to the (optionally unescaped) word before the tokens are
        /// joined, with word and tag separated by <paramref name="separator"/>.
        /// </remarks>
        /// <param name="t">A parse tree</param>
        /// <param name="removeEscaping">If true, remove LDC escape characters. Otherwise, leave them.</param>
        /// <param name="separator">Word/tag separator</param>
        /// <returns>A string of tagged words</returns>
        public static string TaggedStringFromTree(Tree t, bool removeEscaping, string separator)
        {
            Tree pruned = t.Prune(emptyFilter, tf);
            IList<CoreLabel> labels = pruned.TaggedLabeledYield();

            foreach (CoreLabel label in labels)
            {
                string word = label.Word();
                if (removeEscaping)
                {
                    word = UnEscape(word);
                }
                label.SetWord(word);
                label.SetValue(word);
            }
            return SentenceUtils.ListToString(labels, false, separator);
        }
コード例 #5
0
        /// <summary>
        /// Splits <paramref name="input"/> as XML on the given element and asserts that the resulting
        /// sentences match <paramref name="expectedResults"/> in number and content.
        /// </summary>
        private static void CompareXMLResults(string input, string element, params string[] expectedResults)
        {
            BufferedReader       reader       = new BufferedReader(new StringReader(input));
            DocumentPreprocessor preprocessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.Xml);
            preprocessor.SetElementDelimiter(element);

            List<string> actual = new List<string>();
            foreach (IList<IHasWord> sentence in preprocessor)
            {
                string rendered = SentenceUtils.ListToString(sentence);
                actual.Add(rendered);
            }

            NUnit.Framework.Assert.AreEqual(expectedResults.Length, actual.Count);
            int index = 0;
            while (index < actual.Count)
            {
                NUnit.Framework.Assert.AreEqual(expectedResults[index], actual[index]);
                ++index;
            }
        }
コード例 #6
0
        /// <summary>
        /// Tags every sentence of a text file with a maxent tagger model and prints the tagged
        /// sentences to standard output, one per line.
        /// </summary>
        /// <param name="args">Exactly two entries: the tagger model file and the file to tag.</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                log.Info("usage: java TaggerDemo modelFile fileToTag");
                return;
            }
            MaxentTagger   tagger = new MaxentTagger(args[0]);
            BufferedReader reader = new BufferedReader(new FileReader(args[1]));

            try
            {
                IList <IList <IHasWord> > sentences = MaxentTagger.TokenizeText(reader);

                foreach (IList <IHasWord> sentence in sentences)
                {
                    IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tSentence, false));
                }
            }
            finally
            {
                // The reader was previously never closed; release it even if tagging fails.
                reader.Close();
            }
        }
コード例 #7
0
        /// <summary>
        /// Post-processes a multi-word-expression subtree. If its yield, with whitespace removed,
        /// matches <c>[\d\p{Punct}]*</c>, the subtree is replaced by a node with the same value and a
        /// single leaf holding that text; otherwise the node's value is prefixed with
        /// <c>MwePhrasal</c>.
        /// </summary>
        /// <param name="t">The subtree to process; it is modified in place in the second case.</param>
        /// <returns>The replacement node, or <paramref name="t"/> itself after relabeling.</returns>
        private Tree PostProcessMWE(Tree t)
        {
            string spaced    = SentenceUtils.ListToString(t.Yield());
            string collapsed = spaced.ReplaceAll("\\s+", string.Empty);

            if (!collapsed.Matches("[\\d\\p{Punct}]*"))
            {
                t.SetValue(MwePhrasal + t.Value());
                return t;
            }

            IList<Tree> children = new List<Tree>();
            children.Add(treeFactory.NewLeaf(collapsed));
            return treeFactory.NewTreeNode(t.Value(), children);
        }
コード例 #8
0
        // static methods
        /// <summary>
        /// Recursively relabels every non-leaf node of <paramref name="tree"/> using the label that
        /// <paramref name="labelMap"/> holds for the node's yield text.
        /// </summary>
        /// <param name="tree">Tree to relabel in place; leaves are left untouched.</param>
        /// <param name="labelMap">Maps the space-joined yield of a subtree to its label.</param>
        /// <param name="missing">What to do when a subtree's text has no label in the map.</param>
        /// <param name="defaultLabel">Label applied in <c>MissingLabels.Default</c> mode.</param>
        /// <param name="unknowns">Receives the text of every subtree that was given the default label.</param>
        /// <exception cref="System.Exception">A label is missing and the mode is <c>MissingLabels.Fail</c>.</exception>
        /// <exception cref="System.ArgumentException">The <paramref name="missing"/> mode is not recognized.</exception>
        public static void SetLabels(Tree tree, IDictionary <string, string> labelMap, ParseAndSetLabels.MissingLabels missing, string defaultLabel, ICollection <string> unknowns)
        {
            if (tree.IsLeaf())
            {
                return;
            }
            string text = SentenceUtils.ListToString(tree.Yield());

            // The IDictionary indexer throws KeyNotFoundException for an absent key, which made the
            // missing-label handling below unreachable. TryGetValue leaves label null instead.
            string label;
            labelMap.TryGetValue(text, out label);

            if (label != null)
            {
                tree.Label().SetValue(label);
            }
            else
            {
                switch (missing)
                {
                case ParseAndSetLabels.MissingLabels.Fail:
                {
                    throw new Exception("No label for '" + text + "'");
                }

                case ParseAndSetLabels.MissingLabels.Default:
                {
                    tree.Label().SetValue(defaultLabel);
                    unknowns.Add(text);
                    break;
                }

                case ParseAndSetLabels.MissingLabels.KeepOriginal:
                {
                    // do nothing
                    break;
                }

                default:
                {
                    throw new ArgumentException("Unknown MissingLabels mode " + missing);
                }
                }
            }
            foreach (Tree child in tree.Children())
            {
                SetLabels(child, labelMap, missing, defaultLabel, unknowns);
            }
        }
コード例 #9
0
 /// <summary>
 /// Evaluates a guess tree against a gold tree via the base implementation, after rejecting null
 /// trees and logging a warning when the two yields differ in length.
 /// </summary>
 public override void Evaluate(Tree guess, Tree gold, PrintWriter pw)
 {
     // Nothing can be compared without both trees: report and bail out.
     if (guess == null || gold == null)
     {
         System.Console.Error.Printf("%s: Cannot compare against a null gold or guess tree!\n", this.GetType().FullName);
         return;
     }

     bool sameLength = guess.Yield().Count == gold.Yield().Count;
     if (!sameLength)
     {
         log.Info("Warning: yield differs:");
         log.Info("Guess: " + SentenceUtils.ListToString(guess.Yield()));
         log.Info("Gold:  " + SentenceUtils.ListToString(gold.Yield()));
     }

     base.Evaluate(guess, gold, pw);
 }
コード例 #10
0
        /// <summary>
        /// Checks that each Spanish dateline text is split into exactly two sentences and that the
        /// first sentence's tokens match the expected dateline tokens.
        /// </summary>
        public virtual void TestSpanishDatelineSeparation()
        {
            string boundaryRegex = "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/  /[-\\p{L}]+/+ )? " + "( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/";
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", boundaryRegex);
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            NUnit.Framework.Assert.AreEqual(dateLineSpanishTexts.Length, dateLineSpanishTokens.Length, "Bad test data");
            for (int caseIndex = 0; caseIndex < dateLineSpanishTexts.Length; caseIndex++)
            {
                string     text     = dateLineSpanishTexts[caseIndex];
                Annotation document = new Annotation(text);
                pipeline.Annotate(document);

                IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
                NUnit.Framework.Assert.AreEqual(2, sentences.Count, "For " + text + " annotation is " + document);

                // Only the dateline, i.e. the first sentence, is compared token by token.
                IList<CoreLabel> datelineTokens = sentences[0].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(dateLineSpanishTokens[caseIndex], SentenceUtils.ListToString(datelineTokens), "Bad tokens in dateline");
            }
        }
コード例 #11
0
        /// <summary>
        /// Tokenizes each entry of <c>untokInputs</c> with the ATB tokenizer factory (pro, segmentation
        /// and morphology markers removed) and compares the result with <c>tokReferences</c>.
        /// </summary>
        public virtual void TestArabicTokenizer()
        {
            System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            for (int i = 0; i < untokInputs.Length; ++i)
            {
                string line = untokInputs[i];
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
                IList <CoreLabel>      tokens    = tokenizer.Tokenize();
                string tokenizedLine             = SentenceUtils.ListToString(tokens);
                string reference = tokReferences[i];

                // NUnit takes (expected, actual, message). The JUnit-style message-first order
                // compared the message text against the reference instead of the tokenization.
                NUnit.Framework.Assert.AreEqual(reference, tokenizedLine, "Tokenization deviates from reference");
            }
        }
コード例 #12
0
        /// <summary>
        /// Reads tagged sentences from the file named by the <c>input</c> property and prints a
        /// random-length prefix of each one; with <c>fullSentence</c> set, the complete sentence is
        /// printed as well.
        /// </summary>
        /// <param name="args">Command-line arguments, converted to properties.</param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args);
            log.Info(options);

            bool   printWhole = PropertiesUtils.GetBool(options, "fullSentence", false);
            Random rng        = new Random();
            string separator  = options.GetProperty("tagSeparator", TaggerConfig.TagSeparator);
            string inputName  = options.GetProperty("input");

            TaggedFileRecord source = TaggedFileRecord.CreateRecord(options, inputName);
            foreach (IList<TaggedWord> tagged in source.Reader())
            {
                // Prefix length is drawn from 1..Count inclusive.
                int prefixLength = rng.NextInt(tagged.Count) + 1;
                string prefix = SentenceUtils.ListToString(tagged.SubList(0, prefixLength), false, separator);
                System.Console.Out.WriteLine(prefix);

                if (!printWhole)
                {
                    continue;
                }
                System.Console.Out.WriteLine(SentenceUtils.ListToString(tagged, false, separator));
            }
        }
コード例 #13
0
        /// <summary>
        /// With <c>ssplit.newlineIsSentenceBreak=two</c> and newline tokenization enabled, checks that
        /// the sample text yields two sentences and nine tokens, and that the second sentence has the
        /// expected tokens.
        /// </summary>
        public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
        {
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, ssplit",
                "tokenize.language", "en",
                "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            Annotation document = new Annotation("This is \none sentence\n\nThis is not another.");
            pipeline.Annotate(document);

            IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(2, sentences.Count);

            // make sure that there are the correct # of tokens (does contain NL tokens)
            IList<CoreLabel> allTokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

            IList<CoreLabel> secondSentenceTokens = sentences[1].Get(typeof(CoreAnnotations.TokensAnnotation));
            string           secondSentence       = SentenceUtils.ListToString(secondSentenceTokens);
            NUnit.Framework.Assert.AreEqual("This is not another .", secondSentence, "Bad tokens in sentence");
        }
コード例 #14
0
        /// <summary>
        /// Runs the tokenize/cleanxml/ssplit pipeline with KBP-style Spanish settings over
        /// <c>kbpSpanishDocument</c> and compares every resulting sentence, and the sentence count,
        /// with <c>kbpSpanishSentences</c>.
        /// </summary>
        public virtual void TestKbpSpanishWorks()
        {
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
                "clean.xmltags", "headline|text|post",
                "clean.singlesentencetags", "HEADLINE|AUTHOR",
                "clean.sentenceendingtags", "TEXT|POST|QUOTE",
                "clean.turntags", "POST|QUOTE",
                "clean.speakertags", "AUTHOR",
                "clean.datetags", "DATE_TIME",
                "clean.doctypetags", "DOC",
                "clean.docAnnotations", "docID=doc[id]",
                "clean.sectiontags", "HEADLINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            Annotation document = new Annotation(kbpSpanishDocument);
            pipeline.Annotate(document);
            IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));

            // Compare the sentences both sides have first, so a content mismatch is reported
            // before a plain count mismatch.
            int comparable = Math.Min(kbpSpanishSentences.Length, sentences.Count);
            for (int index = 0; index < comparable; index++)
            {
                IList<CoreLabel> tokens = sentences[index].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[index], SentenceUtils.ListToString(tokens), "Bad sentence #" + index);
            }
            NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, sentences.Count, "Bad total number of sentences");
        }
コード例 #15
0
        /// <summary>
        /// Lists, for every class and every n-gram length, the stored trees in descending order of
        /// <c>ScoreComparator</c>, each with its yield and its prediction value for that class.
        /// </summary>
        public override string ToString()
        {
            StringBuilder report = new StringBuilder();

            for (int cls = 0; cls < numClasses; ++cls)
            {
                report.Append("Best scores for class ").Append(cls).Append("\n");

                IDictionary<int, PriorityQueue<Tree>> byLength = classToNGrams[cls];
                foreach (KeyValuePair<int, PriorityQueue<Tree>> bucket in byLength)
                {
                    IList<Tree> ranked = Generics.NewArrayList(bucket.Value);
                    ranked.Sort(ScoreComparator(cls));
                    report.Append("  Len ").Append(bucket.Key).Append("\n");

                    // The list is sorted ascending, so walk it backwards.
                    int position = ranked.Count - 1;
                    while (position >= 0)
                    {
                        Tree   candidate = ranked[position];
                        string words     = SentenceUtils.ListToString(candidate.Yield());
                        report.Append("    " + words + "  [" + RNNCoreAnnotations.GetPredictions(candidate).Get(cls) + "]\n");
                        position--;
                    }
                }
            }
            return report.ToString();
        }
コード例 #16
0
        /// <summary>
        /// Checks that each English dateline text is split into exactly two sentences and that the
        /// first sentence's tokens match the expected dateline tokens.
        /// </summary>
        public virtual void TestDatelineSeparation()
        {
            string boundaryRegex = "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? "
                                   + "/\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | " + "/\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )";
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "en",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", boundaryRegex);
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            NUnit.Framework.Assert.AreEqual(dateLineTexts.Length, dateLineTokens.Length, "Bad test data");
            for (int caseIndex = 0; caseIndex < dateLineTexts.Length; caseIndex++)
            {
                string     text     = dateLineTexts[caseIndex];
                Annotation document = new Annotation(text);
                pipeline.Annotate(document);

                IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
                NUnit.Framework.Assert.AreEqual(2, sentences.Count, "For " + text + " annotation is " + document);

                // Only the dateline, i.e. the first sentence, is compared token by token.
                IList<CoreLabel> datelineTokens = sentences[0].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(dateLineTokens[caseIndex], SentenceUtils.ListToString(datelineTokens), "Bad tokens in dateline");
            }
        }
コード例 #17
0
        /// <summary>
        /// Segments <paramref name="s"/> into words: builds the segmentation lattice, takes the
        /// max-match segmentation, post-processes it, runs the result through the CTP post-processor
        /// and splits that output on <c>"\\s+"</c>.
        /// </summary>
        /// <param name="s">The text to segment.</param>
        /// <returns>One <c>Word</c> per piece of the post-processed output.</returns>
        public virtual IList <IHasWord> Segment(string s)
        {
            BuildSegmentationLattice(s);

            List<Word> rawWords = MaxMatchSegmentation();
            PrintlnErr("raw output: " + SentenceUtils.ListToString(rawWords));

            List<Word> cleanedWords = PostProcessSentence(rawWords);
            PrintlnErr("processed output: " + SentenceUtils.ListToString(cleanedWords));

            // NOTE(review): ToString() is called on the list itself; confirm that it yields the
            // element listing the post-processor presumably expects - TODO verify.
            ChineseStringUtils.CTPPostProcessor ctp = new ChineseStringUtils.CTPPostProcessor();
            string finalText = ctp.PostProcessingAnswer(cleanedWords.ToString(), false);
            PrintlnErr("Sighan2005 output: " + finalText);

            List<Word> finalWords = new List<Word>();
            foreach (string piece in finalText.Split("\\s+"))
            {
                finalWords.Add(new Word(piece));
            }
            return new List<IHasWord>(finalWords);
        }
コード例 #18
0
        /// <summary>
        /// Updates the unigram tagger from <paramref name="t"/> and then, for every node matched by
        /// <c>pMWE</c>, counts the node label against its preterminal yield and its terminal yield,
        /// in both directions.
        /// </summary>
        /// <remarks>
        /// When <c>ResolveDummyTags</c> is set, nodes labeled <c>FrenchXMLTreeReader.MissingPhrasal</c>
        /// are not counted.
        /// </remarks>
        public static void CountMWEStatistics(Tree t, TwoDimensionalCounter <string, string> unigramTagger, TwoDimensionalCounter <string, string> labelPreterm, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string>
                                              labelTerm, TwoDimensionalCounter <string, string> termLabel)
        {
            UpdateTagger(unigramTagger, t);

            //Count MWE statistics
            TregexMatcher matcher = pMWE.Matcher(t);
            while (matcher.FindNextMatchingNode())
            {
                Tree   node      = matcher.GetMatch();
                string nodeLabel = node.Value();

                bool isDummy = ResolveDummyTags && nodeLabel.Equals(FrenchXMLTreeReader.MissingPhrasal);
                if (!isDummy)
                {
                    string tags  = SentenceUtils.ListToString(node.PreTerminalYield());
                    string words = SentenceUtils.ListToString(node.Yield());

                    labelPreterm.IncrementCount(nodeLabel, tags);
                    pretermLabel.IncrementCount(tags, nodeLabel);
                    labelTerm.IncrementCount(nodeLabel, words);
                    termLabel.IncrementCount(words, nodeLabel);
                }
            }
        }
コード例 #19
0
        /// <summary>
        /// Splits <paramref name="input"/> into sentences with a <c>DocumentPreprocessor</c> and asserts
        /// that the sentences match <paramref name="expected"/> in number and content.
        /// </summary>
        /// <param name="input">Text to split.</param>
        /// <param name="expected">The expected sentences, each as a space-joined token string.</param>
        /// <param name="sentenceFinalPuncWords">Sentence-final punctuation words to install, or null to keep the preprocessor's own.</param>
        /// <param name="whitespaceTokenize">If true, the tokenizer factory is cleared and <c>"\n"</c> is used as the sentence delimiter.</param>
        private static void RunTest(string input, string[] expected, string[] sentenceFinalPuncWords, bool whitespaceTokenize)
        {
            IList <string>       results  = new List <string>();
            DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(input)));

            if (sentenceFinalPuncWords != null)
            {
                document.SetSentenceFinalPuncWords(sentenceFinalPuncWords);
            }
            if (whitespaceTokenize)
            {
                document.SetTokenizerFactory(null);
                document.SetSentenceDelimiter("\n");
            }
            foreach (IList <IHasWord> sentence in document)
            {
                results.Add(SentenceUtils.ListToString(sentence));
            }

            // NUnit takes (expected, actual, message); the JUnit-style message-first order was wrong.
            // The list is joined explicitly so the message shows the sentences, not the type name.
            NUnit.Framework.Assert.AreEqual(expected.Length, results.Count, "Should be " + expected.Length + " sentences but got " + results.Count + ": [" + string.Join(", ", results) + "]");
            for (int i = 0; i < results.Count; ++i)
            {
                NUnit.Framework.Assert.AreEqual(expected[i], results[i], "Failed on sentence " + i);
            }
        }
コード例 #20
0
        /// <summary>
        /// With <c>ssplit.newlineIsSentenceBreak=always</c>, checks that the sample text yields three
        /// sentences and nine tokens, and that each sentence has the expected tokens.
        /// </summary>
        public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
        {
            string[] expectedSentences = new string[] { "This is", "one sentence", "This is not another ." };

            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, ssplit",
                "ssplit.newlineIsSentenceBreak", "always");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            Annotation document = new Annotation("This is \none sentence\n\nThis is not another.");
            pipeline.Annotate(document);

            IList<ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(3, sentences.Count);

            // make sure that there are the correct # of tokens (count does contain NL tokens)
            IList<CoreLabel> allTokens = document.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

            int comparable = Math.Min(expectedSentences.Length, sentences.Count);
            for (int index = 0; index < comparable; index++)
            {
                IList<CoreLabel> tokens = sentences[index].Get(typeof(CoreAnnotations.TokensAnnotation));
                NUnit.Framework.Assert.AreEqual(expectedSentences[index], SentenceUtils.ListToString(tokens), "Bad sentence #" + index);
            }
        }
コード例 #21
0
        /// <summary>
        /// Command-line entry point: parses every sentence of an input file with a DV reranking parser
        /// and writes the sentence, its best tree and the associated matrices to one file per sentence
        /// (<c>sentenceN.txt</c>, numbered from 1) inside the output directory.
        /// </summary>
        /// <remarks>
        /// Recognized flags (case-insensitive): <c>-model</c>, <c>-output</c>, <c>-input</c> and
        /// <c>-testTreebank</c>. All other arguments are forwarded to
        /// <c>LexicalizedParser.LoadModel</c>. The <c>-testTreebank</c> description is parsed and
        /// consumed but not otherwise used here. Without <c>-input</c>, only the output directory is
        /// created.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.ArgumentException">
        /// A flag is missing its value, no <c>-output</c> was given, or the loaded model does not
        /// provide a reranking parser query backed by a DV model reranker.
        /// </exception>
        public static void Main(string[] args)
        {
            string         modelPath  = null;
            string         outputPath = null;
            string         inputPath  = null;
            IList <string> unusedArgs = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag     = args[argIndex];
                bool   isModel  = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
                bool   isOutput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-output");
                bool   isInput  = Sharpen.Runtime.EqualsIgnoreCase(flag, "-input");

                if (isModel || isOutput || isInput)
                {
                    // A value flag given as the last argument used to index past the array.
                    if (argIndex + 1 >= args.Length)
                    {
                        throw new ArgumentException("Missing value for option " + flag);
                    }
                    string value = args[argIndex + 1];
                    if (isModel)
                    {
                        modelPath = value;
                    }
                    else if (isOutput)
                    {
                        outputPath = value;
                    }
                    else
                    {
                        inputPath = value;
                    }
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-testTreebank"))
                {
                    // The description is still parsed, but its result is unused in this method.
                    ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                }
                else
                {
                    unusedArgs.Add(flag);
                    argIndex++;
                }
            }

            // Fail before the model is loaded rather than on a null path afterwards.
            if (outputPath == null)
            {
                throw new ArgumentException("No output directory specified; use -output");
            }

            string[]          newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser parser  = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
            DVModel           model   = DVParser.GetModelFromLexicalizedParser(parser);
            File outputFile           = new File(outputPath);

            FileSystem.CheckNotExistsOrFail(outputFile);
            FileSystem.MkdirOrFail(outputFile);

            if (inputPath == null)
            {
                return;
            }

            int    count = 0;
            Reader input = new BufferedReader(new FileReader(inputPath));

            try
            {
                DocumentPreprocessor processor = new DocumentPreprocessor(input);
                foreach (IList <IHasWord> sentence in processor)
                {
                    count++;
                    // index from 1
                    IParserQuery pq = parser.ParserQuery();
                    if (!(pq is RerankingParserQuery))
                    {
                        throw new ArgumentException("Expected a RerankingParserQuery");
                    }
                    RerankingParserQuery rpq = (RerankingParserQuery)pq;
                    if (!rpq.Parse(sentence))
                    {
                        throw new Exception("Unparsable sentence: " + sentence);
                    }
                    IRerankerQuery reranker = rpq.RerankerQuery();
                    if (!(reranker is DVModelReranker.Query))
                    {
                        throw new ArgumentException("Expected a DVModelReranker");
                    }
                    DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
                    IdentityHashMap <Tree, SimpleMatrix> vectors = deepTree.GetVectors();
                    foreach (KeyValuePair <Tree, SimpleMatrix> entry in vectors)
                    {
                        log.Info(entry.Key + "   " + entry.Value);
                    }

                    FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
                    try
                    {
                        BufferedWriter bout = new BufferedWriter(fout);
                        bout.Write(SentenceUtils.ListToString(sentence));
                        bout.NewLine();
                        bout.Write(deepTree.GetTree().ToString());
                        bout.NewLine();
                        foreach (IHasWord word in sentence)
                        {
                            OutputMatrix(bout, model.GetWordVector(word.Word()));
                        }
                        Tree rootTree = FindRootTree(vectors);
                        OutputTreeMatrices(bout, rootTree, vectors);
                        bout.Flush();
                    }
                    finally
                    {
                        // Release the file even if writing a matrix fails part-way.
                        fout.Close();
                    }
                }
            }
            finally
            {
                // The input reader was previously never closed.
                input.Close();
            }
        }
コード例 #22
0
        /// <summary>Test the parser on a treebank.</summary>
        /// <remarks>
        /// Test the parser on a treebank. Parses will be written to stdout, and
        /// various other information will be written to stderr and stdout,
        /// particularly if <code>op.testOptions.verbose</code> is true.
        /// When <code>op.testOptions.testingThreads</code> is not 1 the sentences are
        /// parsed through a <code>MulticoreWrapper</code>; otherwise a single parser
        /// query is reused for every tree. Each parse is handed to
        /// <code>ProcessResults</code> together with its gold tree.
        /// </remarks>
        /// <param name="testTreebank">The treebank to parse</param>
        /// <returns>
        /// The labeled precision/recall F<sub>1</sub> (EVALB measure)
        /// of the parser on the treebank, taken from <code>factLB</code>;
        /// 0.0 when <code>factLB</code> is null.
        /// </returns>
        public virtual double TestOnTreebank(Treebank testTreebank)
        {
            log.Info("Testing on treebank");
            Timing    treebankTotalTimer        = new Timing();
            TreePrint treePrint                 = op.testOptions.TreePrint(op.tlpParams);
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = op.Langpack();
            PrintWriter pwOut;
            PrintWriter pwErr;

            // In quiet mode both writers discard their output while parsing.
            if (op.testOptions.quietEvaluation)
            {
                NullOutputStream quiet = new NullOutputStream();
                pwOut = tlpParams.Pw(quiet);
                pwErr = tlpParams.Pw(quiet);
            }
            else
            {
                pwOut = tlpParams.Pw();
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (op.testOptions.verbose)
            {
                pwErr.Print("Testing ");
                pwErr.Println(testTreebank.TextualSummary(tlp));
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(tlpParams);
            }

            // Optional output files. A failure to open one is reported and the
            // corresponding writer stays null, so evaluation still proceeds.
            PrintWriter pwFileOut = null;

            if (op.testOptions.writeOutputFiles)
            {
                string fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
                try
                {
                    pwFileOut = op.tlpParams.Pw(new FileOutputStream(fname));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            PrintWriter pwStats = null;

            if (op.testOptions.outputkBestEquivocation != null)
            {
                try
                {
                    pwStats = op.tlpParams.Pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            if (op.testOptions.testingThreads != 1)
            {
                MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                // Gold trees are queued in submission order and polled again as
                // results come back from the wrapper.
                LinkedList <Tree> goldTrees = new LinkedList <Tree>();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <IHasWord> sentence = GetInputSentence(goldTree);
                    goldTrees.Add(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    wrapper.Put(sentence);
                    while (wrapper.Peek())
                    {
                        IParserQuery pq = wrapper.Poll();
                        // A foreach iteration variable is read-only in C#, so the
                        // polled gold tree needs a local of its own.
                        Tree finishedGoldTree = goldTrees.Poll();
                        ProcessResults(pq, finishedGoldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                    }
                }
                // Wait for the remaining work, then drain whatever is left.
                wrapper.Join();
                while (wrapper.Peek())
                {
                    IParserQuery pq         = wrapper.Poll();
                    Tree         goldTree_1 = goldTrees.Poll();
                    ProcessResults(pq, goldTree_1, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            else
            {
                IParserQuery pq = pqFactory.ParserQuery();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <CoreLabel> sentence = GetInputSentence(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    pq.ParseAndReport(sentence, pwErr);
                    ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            //Done parsing...print the results of the evaluations
            treebankTotalTimer.Done("Testing on treebank");
            // The evaluation results are reported even in quiet mode, so switch
            // the error writer back to the console.
            if (op.testOptions.quietEvaluation)
            {
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (saidMemMessage)
            {
                ParserUtils.PrintOutOfMemory(pwErr);
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            if (numSkippedEvals != 0)
            {
                pwErr.Printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
            }
            // only created here so we know what parser types are supported...
            IParserQuery pq_1 = pqFactory.ParserQuery();

            if (summary)
            {
                if (pcfgLB != null)
                {
                    pcfgLB.Display(false, pwErr);
                }
                if (pcfgChildSpecific != null)
                {
                    pcfgChildSpecific.Display(false, pwErr);
                }
                if (pcfgLA != null)
                {
                    pcfgLA.Display(false, pwErr);
                }
                if (pcfgCB != null)
                {
                    pcfgCB.Display(false, pwErr);
                }
                if (pcfgDA != null)
                {
                    pcfgDA.Display(false, pwErr);
                }
                if (pcfgTA != null)
                {
                    pcfgTA.Display(false, pwErr);
                }
                if (pcfgLL != null && pq_1.GetPCFGParser() != null)
                {
                    pcfgLL.Display(false, pwErr);
                }
                if (depDA != null)
                {
                    depDA.Display(false, pwErr);
                }
                if (depTA != null)
                {
                    depTA.Display(false, pwErr);
                }
                if (depLL != null && pq_1.GetDependencyParser() != null)
                {
                    depLL.Display(false, pwErr);
                }
                if (factLB != null)
                {
                    factLB.Display(false, pwErr);
                }
                if (factChildSpecific != null)
                {
                    factChildSpecific.Display(false, pwErr);
                }
                if (factLA != null)
                {
                    factLA.Display(false, pwErr);
                }
                if (factCB != null)
                {
                    factCB.Display(false, pwErr);
                }
                if (factDA != null)
                {
                    factDA.Display(false, pwErr);
                }
                if (factTA != null)
                {
                    factTA.Display(false, pwErr);
                }
                if (factLL != null && pq_1.GetFactoredParser() != null)
                {
                    factLL.Display(false, pwErr);
                }
                if (pcfgCatE != null)
                {
                    pcfgCatE.Display(false, pwErr);
                }
                foreach (IEval eval in evals)
                {
                    eval.Display(false, pwErr);
                }
                foreach (BestOfTopKEval eval_1 in topKEvals)
                {
                    eval_1.Display(false, pwErr);
                }
            }
            // these ones only have a display mode, so display if turned on!!
            if (pcfgRUO != null)
            {
                pcfgRUO.Display(true, pwErr);
            }
            if (pcfgCUO != null)
            {
                pcfgCUO.Display(true, pwErr);
            }
            if (tsv)
            {
                // One tab-separated row; a column is left empty when its
                // evaluator (or the parser it depends on) is unavailable.
                NumberFormat nf = new DecimalFormat("0.00");
                pwErr.Println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && factDA != null)
                {
                    pwErr.Print(nf.Format(factDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetExactPercent()));
                }
                pwErr.Print("\t");
                if (pcfgLB != null)
                {
                    pwErr.Print(nf.Format(pcfgLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && depDA != null)
                {
                    pwErr.Print(nf.Format(depDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetPCFGParser() != null && factTA != null)
                {
                    pwErr.Print(nf.Format(factTA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(factLB.GetNum());
                }
                pwErr.Println();
            }
            double f1 = 0.0;

            if (factLB != null)
            {
                f1 = factLB.GetEvalbF1();
            }
            //Close files (if necessary)
            if (pwFileOut != null)
            {
                pwFileOut.Close();
            }
            if (pwStats != null)
            {
                pwStats.Close();
            }
            if (parserQueryEvals != null)
            {
                foreach (IParserQueryEval parserQueryEval in parserQueryEvals)
                {
                    parserQueryEval.Display(false, pwErr);
                }
            }
            return(f1);
        }
コード例 #23
0
        /// <summary>
        /// Command-line driver: segments each input line with a
        /// <c>MaxMatchSegmenter</c> and prints the result to the console.
        /// </summary>
        /// <remarks>
        /// Properties are parsed from <paramref name="args"/>:
        /// "lexicon" is required (the process exits with status 1 without it);
        /// "greedy" selects greedy segmentation; otherwise "maxwords" selects the
        /// Maxwords heuristic, and with neither the max-match segmentation is used.
        /// Lines are read from <c>Runtime.@in</c> (presumably standard input) until
        /// end of input or an I/O error.
        /// </remarks>
        /// <param name="args">Command-line arguments, converted to properties.</param>
        public static void Main(string[] args)
        {
            Properties         props       = StringUtils.ArgsToProperties(args);
            SeqClassifierFlags flags       = new SeqClassifierFlags(props);
            MaxMatchSegmenter  seg         = new MaxMatchSegmenter();
            string             lexiconFile = props.GetProperty("lexicon");

            if (lexiconFile != null)
            {
                seg.AddLexicon(lexiconFile);
            }
            else
            {
                logger.Error("Error: no lexicon file!");
                System.Environment.Exit(1);
            }
            Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();

            sighanRW.Init(flags);
            BufferedReader br      = new BufferedReader(new InputStreamReader(Runtime.@in));
            PrintWriter    stdoutW = new PrintWriter(System.Console.Out);
            int            lineNb  = 0;

            // The mode flags do not change while reading, so look them up once.
            bool greedy   = props.GetProperty("greedy") != null;
            bool maxWords = props.GetProperty("maxwords") != null;

            for (; ;)
            {
                ++lineNb;
                logger.Info("line: " + lineNb);
                try
                {
                    string line = br.ReadLine();
                    if (line == null)
                    {
                        break;
                    }
                    string outputLine;
                    if (greedy)
                    {
                        List <Word> sentence = seg.GreedilySegmentWords(line);
                        outputLine = SentenceUtils.ListToString(sentence);
                    }
                    else
                    {
                        // Both lattice-based modes need the lattice built first.
                        seg.BuildSegmentationLattice(line);
                        if (maxWords)
                        {
                            outputLine = SentenceUtils.ListToString(seg.SegmentWords(MaxMatchSegmenter.MatchHeuristic.Maxwords));
                        }
                        else
                        {
                            outputLine = SentenceUtils.ListToString(seg.MaxMatchSegmentation());
                        }
                    }
                    StringReader strR = new StringReader(outputLine);
                    IEnumerator <IList <CoreLabel> > itr = sighanRW.GetIterator(strR);
                    while (itr.MoveNext())
                    {
                        sighanRW.PrintAnswers(itr.Current, stdoutW);
                    }
                }
                catch (IOException e)
                {
                    // Stop processing, but report why instead of failing silently.
                    logger.Error("I/O error while processing line " + lineNb + ": " + e.Message);
                    break;
                }
            }
            stdoutW.Flush();
        }
コード例 #24
0
        /// <summary>
        /// Sets the part-of-speech annotation on every token of one sentence.
        /// </summary>
        /// <remarks>
        /// The sentence is tagged with <c>pos.TagSentence</c> only when it has at
        /// most <c>maxSentenceLength</c> tokens. If tagging is skipped or runs out
        /// of memory, every token receives the placeholder tag "X".
        /// </remarks>
        /// <param name="sentence">Sentence whose tokens annotation is read and updated in place.</param>
        /// <returns>The same <paramref name="sentence"/> instance.</returns>
        private ICoreMap DoOneSentence(ICoreMap sentence)
        {
            IList <CoreLabel>  tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <TaggedWord> tagged = null;

            if (tokens.Count <= maxSentenceLength)
            {
                try
                {
                    tagged = pos.TagSentence(tokens, this.reuseTags);
                }
                catch (OutOfMemoryException e)
                {
                    log.Error(e);
                    // Beware that we can now get an OOM in logging, too.
                    log.Warn("Tagging of sentence ran out of memory. " + "Will ignore and continue: " + SentenceUtils.ListToString(tokens));
                }
            }
            if (tagged != null)
            {
                // The loop bound was previously an undeclared identifier; it is
                // the number of tokens in the sentence.
                int sz = tokens.Count;
                for (int i = 0; i < sz; i++)
                {
                    tokens[i].Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), tagged[i].Tag());
                }
            }
            else
            {
                foreach (CoreLabel token in tokens)
                {
                    token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "X");
                }
            }
            return(sentence);
        }
コード例 #25
0
        // main method only
        /// <summary>
        /// Reads tagged sentences from one or more tree files and writes them, one
        /// sentence per line, to a single output file.
        /// </summary>
        /// <remarks>
        /// Options (each accepted with one or two leading dashes, case-insensitive):
        /// -output, -tagSeparator, -treeRange, -inputEncoding, -outputEncoding and
        /// -treeFilter take a value; -noTags and -noSpaces are switches. Every other
        /// argument is treated as an input filename. An option that expects a value
        /// but is the last argument is also treated as an input filename.
        /// The process exits with status 2 when no output file or no input file is given.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string         outputFilename = string.Empty;
            string         tagSeparator   = string.Empty;
            string         treeRange      = string.Empty;
            string         inputEncoding  = "UTF-8";
            string         outputEncoding = "UTF-8";
            string         treeFilter     = string.Empty;
            bool           noTags         = false;
            bool           noSpaces       = false;
            IList <string> inputFilenames = new List <string>();

            for (int i = 0; i < args.Length; ++i)
            {
                string arg      = args[i];
                bool   hasValue = i + 1 < args.Length;

                if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-output") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--output")))
                {
                    outputFilename = args[++i];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-tagSeparator") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--tagSeparator")))
                {
                    tagSeparator = args[++i];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-treeRange") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--treeRange")))
                {
                    treeRange = args[++i];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-inputEncoding") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--inputEncoding")))
                {
                    inputEncoding = args[++i];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-outputEncoding") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--outputEncoding")))
                {
                    outputEncoding = args[++i];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-treeFilter") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--treeFilter")))
                {
                    treeFilter = args[++i];
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-noTags") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--noTags"))
                {
                    noTags = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-noSpaces") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--noSpaces"))
                {
                    noSpaces = true;
                }
                else
                {
                    inputFilenames.Add(arg);
                }
            }
            if (outputFilename.Equals(string.Empty))
            {
                log.Info("Must specify an output filename, -output");
                System.Environment.Exit(2);
            }
            if (inputFilenames.Count == 0)
            {
                log.Info("Must specify one or more input filenames");
                System.Environment.Exit(2);
            }
            FileOutputStream fos = new FileOutputStream(outputFilename);

            try
            {
                OutputStreamWriter osw   = new OutputStreamWriter(fos, outputEncoding);
                BufferedWriter     bout  = new BufferedWriter(osw);
                Properties         props = new Properties();

                foreach (string filename in inputFilenames)
                {
                    // Build the record description from right to left: the format and
                    // filename come last, optional settings are prepended.
                    string description = TaggedFileRecord.Format + "=" + TaggedFileRecord.Format.Trees + "," + filename;
                    if (!treeRange.IsEmpty())
                    {
                        description = TaggedFileRecord.TreeRange + "=" + treeRange + "," + description;
                    }
                    if (!treeFilter.IsEmpty())
                    {
                        description = TaggedFileRecord.TreeFilter + "=" + treeFilter + "," + description;
                    }
                    description = TaggedFileRecord.Encoding + "=" + inputEncoding + "," + description;
                    TaggedFileRecord record = TaggedFileRecord.CreateRecord(props, description);
                    foreach (IList <TaggedWord> sentence in record.Reader())
                    {
                        string output = SentenceUtils.ListToString(sentence, noTags, tagSeparator);
                        if (noSpaces)
                        {
                            output = output.ReplaceAll(" ", string.Empty);
                        }
                        bout.Write(output);
                        bout.NewLine();
                    }
                }
                bout.Flush();
                bout.Close();
                osw.Close();
            }
            finally
            {
                // Release the output file even when reading or writing failed.
                fos.Close();
            }
        }
コード例 #26
0
        /// <summary>Run the scoring metric on guess/gold input.</summary>
        /// <remarks>
        /// Run the scoring metric on guess/gold input. This method performs "Collinization."
        /// The default language is English.
        /// Options: <c>-l LANG</c> selects the language parameters, <c>-y N</c> sets the
        /// maximum gold yield, <c>-g N</c> sets the maximum guess yield, <c>-t</c> turns
        /// on tag mode and <c>-v</c> turns on verbose output. The first argument that
        /// does not start with "-" is the gold file and the one after it the guess file.
        /// The usage text is printed and the process exits with status -1 when the
        /// arguments are insufficient or an unknown option is given.
        /// </remarks>
        /// <param name="args">Options followed by the gold file and the guess file.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            int    maxGoldYield            = int.MaxValue;
            int    maxGuessYield           = int.MaxValue;
            bool   verbose   = false;
            bool   skipGuess = false;
            bool   tagMode   = false;
            string guessFile = null;
            string goldFile  = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-y":
                    {
                        maxGoldYield = System.Convert.ToInt32(args[++i].Trim());
                        break;
                    }

                    case "-t":
                    {
                        tagMode = true;
                        break;
                    }

                    case "-v":
                    {
                        verbose = true;
                        break;
                    }

                    case "-g":
                    {
                        maxGuessYield = System.Convert.ToInt32(args[++i].Trim());
                        skipGuess     = true;
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    //Required parameters: the gold file followed by the guess file
                    if (i + 1 < args.Length)
                    {
                        goldFile  = args[i];
                        guessFile = args[i + 1];
                    }
                    break;
                }
            }
            if (goldFile == null || guessFile == null)
            {
                // Both files are required; without them there is nothing to load.
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            PrintWriter pwOut         = tlpp.Pw();
            Treebank    guessTreebank = tlpp.DiskTreebank();

            guessTreebank.LoadPath(guessFile);
            pwOut.Println("GUESS TREEBANK:");
            pwOut.Println(guessTreebank.TextualSummary());
            Treebank goldTreebank = tlpp.DiskTreebank();

            goldTreebank.LoadPath(goldFile);
            pwOut.Println("GOLD TREEBANK:");
            pwOut.Println(goldTreebank.TextualSummary());
            string evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";

            Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval eval = new Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval(evalName, tagMode);
            ITreeTransformer tc = tlpp.Collinizer();
            //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
            //don't match, we need to keep looking for the next gold tree that matches.
            //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
            //status as follows:
            //
            //   0 - Ok (yields match)
            //   1 - length mismatch
            //   2 - null parse e.g. (()).
            //
            //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
            IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator();
            int goldLineId             = 0;
            int skippedGuessTrees      = 0;

            foreach (Tree guess in guessTreebank)
            {
                Tree          evalGuess  = tc.TransformTree(guess);
                List <ILabel> guessSent  = guess.Yield();
                string        guessChars = SentenceUtils.ListToString(guessSent).ReplaceAll("\\s+", string.Empty);
                if (guessSent.Count > maxGuessYield)
                {
                    skippedGuessTrees++;
                    continue;
                }
                bool doneEval = false;
                // doneEval is tested first on purpose: MoveNext() advances the
                // enumerator, so calling it after a successful evaluation would
                // consume (and lose) the gold tree that belongs to the next guess.
                while (!doneEval && goldItr.MoveNext())
                {
                    Tree gold     = goldItr.Current;
                    Tree evalGold = tc.TransformTree(gold);
                    goldLineId++;
                    List <ILabel> goldSent  = gold.Yield();
                    string        goldChars = SentenceUtils.ListToString(goldSent).ReplaceAll("\\s+", string.Empty);
                    if (goldSent.Count > maxGoldYield)
                    {
                        // Gold tree too long: try the next gold tree for this guess.
                        continue;
                    }
                    if (goldChars.Length != guessChars.Length)
                    {
                        //Default evalb behavior -- skip this guess tree
                        pwOut.Printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.Length, goldChars.Length);
                        skippedGuessTrees++;
                        //Move to the next guess parse
                        break;
                    }
                    eval.Evaluate(evalGuess, evalGold, ((verbose) ? pwOut : null));
                    doneEval = true;
                }
            }
            pwOut.Println("================================================================================");
            if (skippedGuessTrees != 0)
            {
                pwOut.Printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
            }
            eval.Display(true, pwOut);
            pwOut.Println();
            pwOut.Close();
        }
コード例 #27
0
        /// <summary>
        /// Searches treebank files for a sentence and prints where it occurs.
        /// </summary>
        /// <remarks>
        /// -tagSeparator, -encoding and -fileRegex (one or two leading dashes,
        /// case-insensitive) each take a value. Of the remaining arguments the
        /// first is the sentence to look for (trimmed) and the rest are paths to search.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        public static void Main(string[] args)
        {
            string         needle       = string.Empty;
            string         tagSeparator = "_";
            string         encoding     = "utf-8";
            string         fileRegex    = string.Empty;
            IList <string> paths        = new List <string>();

            int argIndex = 0;
            while (argIndex < args.Length)
            {
                string arg      = args[argIndex];
                bool   hasValue = argIndex + 1 < args.Length;

                if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-tagSeparator") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--tagSeparator")))
                {
                    tagSeparator = args[++argIndex];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-encoding") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--encoding")))
                {
                    encoding = args[++argIndex];
                }
                else if (hasValue && (Sharpen.Runtime.EqualsIgnoreCase(arg, "-fileRegex") || Sharpen.Runtime.EqualsIgnoreCase(arg, "--fileRegex")))
                {
                    fileRegex = args[++argIndex];
                }
                else if (needle.Length == 0)
                {
                    // The first free-standing argument is the sentence to find.
                    needle = arg.Trim();
                }
                else
                {
                    paths.Add(arg);
                }
                argIndex++;
            }
            ITreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
            // NOTE(review): the regex is compiled (so an invalid pattern still
            // fails here) but no filter object is built from it; the filter
            // passed to LoadPath below is always null.
            IFileFilter filter = null;

            if (fileRegex.Length != 0)
            {
                Pattern.Compile(fileRegex);
            }
            foreach (string path in paths)
            {
                // Each path gets its own treebank, read with the chosen encoding.
                DiskTreebank treebank = new DiskTreebank(trf, encoding);
                treebank.LoadPath(path, filter);
                IEnumerator <Tree> treeIterator = treebank.GetEnumerator();
                int    treeCount   = 0;
                string currentFile = string.Empty;
                while (treeIterator.MoveNext())
                {
                    // A path may cover several files; restart the per-file tree
                    // counter whenever the treebank moves on to another file.
                    if (!currentFile.Equals(treebank.GetCurrentFilename()))
                    {
                        currentFile = treebank.GetCurrentFilename();
                        treeCount   = 0;
                    }
                    treeCount++;
                    IList <TaggedWord> sentence = treeIterator.Current.TaggedYield();
                    // Three renderings of the sentence are compared with the needle:
                    // untagged, untagged with spaces removed (useful for Chinese,
                    // for example), and tagged.
                    string untagged    = SentenceUtils.ListToString(sentence, true);
                    string unsegmented = untagged.ReplaceAll(" ", string.Empty);
                    string taggedForm  = SentenceUtils.ListToString(sentence, false, tagSeparator);
                    bool   matches     = needle.Equals(untagged) || needle.Equals(unsegmented) || needle.Equals(taggedForm);
                    if (!matches)
                    {
                        continue;
                    }
                    System.Console.Out.WriteLine("needle found in " + currentFile + " tree " + treeCount);
                }
            }
        }
コード例 #28
0
        /// <summary>
        /// Reads a UTF-8 file line by line, tokenizes each line with the Arabic
        /// tokenizer and, independently, maps each whitespace-separated token with
        /// the default lexical mapper, then reports any disagreement between the
        /// two results on standard error.
        /// args[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <param name="args">Exactly two entries: the tokenizer options and the path of the file to tokenize.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            // Declared outside the try so the finally clause can release it on every path.
            BufferedReader br = null;
            try
            {
                br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList <CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    // Re-split the joined mapper output so both sides are compared as token lists.
                    IList <string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        // Report each differing token, then print the whole line once for context.
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                // Release the file handle whether reading succeeded or failed.
                if (br != null)
                {
                    try
                    {
                        br.Close();
                    }
                    catch (IOException e)
                    {
                        Sharpen.Runtime.PrintStackTrace(e);
                    }
                }
            }
        }
コード例 #29
0
        /// <summary>
        /// Parses one sentence under the given constraints and collects the
        /// resulting trees: the single best parse when <c>kBest</c> is 1, otherwise
        /// up to <c>kBest</c> parses. Each collected tree has its score set to the
        /// parser's score taken modulo -10000.0.
        /// </summary>
        /// <param name="constraints">Constraints handed to the parser query before parsing.</param>
        /// <param name="words">The sentence to parse.</param>
        /// <returns>
        /// The collected trees; empty when no parse was produced or when one of
        /// the handled exceptions occurred while retrieving parses.
        /// </returns>
        private IList <Tree> DoOneSentence(IList <ParserConstraint> constraints, IList <CoreLabel> words)
        {
            IParserQuery query = parser.ParserQuery();
            query.SetConstraints(constraints);
            query.Parse(words);

            IList <Tree> parses = Generics.NewLinkedList();
            try
            {
                if (this.kBest != 1)
                {
                    IList <ScoredObject <Tree> > candidates = query.GetKBestParses(this.kBest);
                    bool nothingParsed = candidates == null || candidates.Count < 1;
                    if (nothingParsed)
                    {
                        log.Warn("Parsing of sentence failed.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                    else
                    {
                        foreach (ScoredObject <Tree> candidate in candidates)
                        {
                            Tree parse = candidate.Object();
                            // -10000 denotes unknown words
                            parse.SetScore(candidate.Score() % -10000.0);
                            parses.Add(parse);
                        }
                    }
                }
                else
                {
                    // With kBest == 1 only the best parse is requested.
                    Tree best = query.GetBestParse();
                    if (best != null)
                    {
                        best.SetScore(query.GetBestScore() % -10000.0);
                        parses.Add(best);
                    }
                    else
                    {
                        log.Warn("Parsing of sentence failed.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
                    }
                }
            }
            catch (OutOfMemoryException oom)
            {
                log.Error(oom);
                // Logging itself may run out of memory at this point.
                log.Warn("Parsing of sentence ran out of memory (length=" + words.Count + ").  " + "Will ignore and try to continue.");
            }
            catch (NoSuchParseException)
            {
                log.Warn("Parsing of sentence failed, possibly because of out of memory.  " + "Will ignore and continue: " + SentenceUtils.ListToString(words));
            }

            return parses;
        }
コード例 #30
0
        /// <summary>
        /// Replaces the label of each leaf of <paramref name="tree"/> with the
        /// entry of <c>originalSentence</c> at the same position. Does nothing when
        /// <c>originalSentence</c> or <paramref name="tree"/> is null.
        /// TODO: clearly this should be a default method in ParserQuery once Java 8 comes out
        /// </summary>
        /// <param name="tree">The tree whose leaf labels are overwritten; may be null.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the number of leaves differs from the number of entries in
        /// <c>originalSentence</c>.
        /// </exception>
        public virtual void RestoreOriginalWords(Tree tree)
        {
            if (originalSentence == null || tree == null)
            {
                return;
            }
            IList <Tree> leaves = tree.GetLeaves();

            if (leaves.Count != originalSentence.Count)
            {
                throw new InvalidOperationException("originalWords and sentence of different sizes: " + originalSentence.Count + " vs. " + leaves.Count + "\n Orig: " + SentenceUtils.ListToString(originalSentence) + "\n Pars: " + SentenceUtils.ListToString(leaves));
            }
            // TODO: get rid of this cast
            IEnumerator <ILabel> wordsIterator = (IEnumerator <ILabel>)originalSentence.GetEnumerator();

            foreach (Tree leaf in leaves)
            {
                // An enumerator starts before the first element, so it must be
                // advanced before each read of Current; otherwise no word is ever
                // consumed and the leaves are not paired with their own words.
                if (!wordsIterator.MoveNext())
                {
                    // Not expected, because the sizes were checked above.
                    throw new InvalidOperationException("originalWords ran out before all leaves were relabeled");
                }
                leaf.SetLabel(wordsIterator.Current);
            }
        }