Example #1
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                log.Info("usage: java TaggerDemo2 modelFile fileToTag");
                return;
            }
            MaxentTagger tagger = new MaxentTagger(args[0]);
            ITokenizerFactory <CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
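            // "untokenizable=noneKeep" keeps untokenizable characters as single-character tokens and logs no warnings about them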
            BufferedReader       r  = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter          pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
            DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);

            documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
            foreach (IList <IHasWord> sentence in documentPreprocessor)
            {
                IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                pw.Println(SentenceUtils.ListToString(tSentence, false));
            }
            // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
            IList <IHasWord>   sent       = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
            IList <TaggedWord> taggedSent = tagger.TagSentence(sent);

            foreach (TaggedWord tw in taggedSent)
            {
                if (tw.Tag().StartsWith("JJ"))
                {
                    pw.Println(tw.Word());
                }
            }
            pw.Close();
        }
        /// <summary>Perform (possibly destructive) operations on the tree.</summary>
        /// <remarks>Perform (possibly destructive) operations on the tree. Do a top-down DFS on the tree.</remarks>
        public virtual void VisitTree(Tree tree)
        {
            if (tree == null)
            {
                return;
            }
            string yield = SentenceUtils.ListToString(tree.Yield());

            if (mweDictionary.Contains(yield))
            {
                IList <Tree> children = GetPreterminalSubtrees(tree);
                string       newLabel = "MW" + tree.Value();
                tree.SetValue(newLabel);
                tree.SetChildren(children);
                // Bottom out of the recursion
                return;
            }
            else
            {
                foreach (Tree subTree in tree.Children())
                {
                    if (subTree.IsPhrasal())
                    {
                        // Only phrasal trees can have yields > 1!!
                        VisitTree(subTree);
                    }
                }
            }
        }
Example #3
        public static IList <CoreLabel> StringToIOB(string str, char segMarker)
        {
            // Whitespace tokenization
            IList <CoreLabel> toks = SentenceUtils.ToCoreLabelList(str.Trim().Split("\\s+"));

            return(StringToIOB(toks, segMarker, false));
        }
Example #4
 public virtual void PrintSentences(IEnumerable <IList <IHasWord> > sentences, string filename)
 {
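     // Writes each sentence on its own line as "<s> token token ... </s>", warning when a sentence exceeds maxSentenceLength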
     try
     {
         PrintWriter pw = IOUtils.GetPrintWriter(filename);
         foreach (IList <IHasWord> sentence in sentences)
         {
             pw.Print("<s> ");
             // Note: Use <s sentence-id > to identify sentences
             string sentString = SentenceUtils.ListToString(sentence);
             if (sentence.Count > maxSentenceLength)
             {
                 logger.Warning("Sentence length=" + sentence.Count + " is longer than maximum set length " + maxSentenceLength);
                 logger.Warning("Long Sentence: " + sentString);
             }
             pw.Print(sentString);
             pw.Println(" </s>");
         }
         pw.Close();
     }
     catch (IOException ex)
     {
          // Rethrow, preserving the original IOException as the inner exception
          throw new Exception(ex.Message, ex);
     }
 }
Example #5
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.  Output is handled with a
        /// TreePrint object.  Note that the options used when creating the
        /// TreePrint can determine what results to print out.  Once again,
        /// one can capture the output by passing a PrintWriter to
        /// TreePrint.printTree. This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            string[]          sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> rawWords = SentenceUtils.ToCoreLabelList(sent);
            Tree parse = lp.Apply(rawWords);

            parse.PennPrint();
            System.Console.Out.WriteLine();
            // This option shows loading and using an explicit tokenizer
            string sent2 = "This is another sentence.";
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tok       = tokenizerFactory.GetTokenizer(new StringReader(sent2));
            IList <CoreLabel>             rawWords2 = tok.Tokenize();

            parse = lp.Apply(rawWords2);
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
            GrammaticalStructure         gs  = gsf.NewGrammaticalStructure(parse);
            IList <TypedDependency>      tdl = gs.TypedDependenciesCCprocessed();
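            // TypedDependenciesCCprocessed() returns collapsed typed dependencies with conjunct dependencies propagated across coordination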

            System.Console.Out.WriteLine(tdl);
            System.Console.Out.WriteLine();
            // You can also use a TreePrint object to print trees and dependencies
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.PrintTree(parse);
        }
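
The remarks above note that TreePrint output can be captured by handing a PrintWriter to printTree instead of letting it go to the console. A minimal sketch of that idea follows; the StringWriter wrapper and the PrintTree(Tree, PrintWriter) overload are assumptions based on the underlying Java TreePrint API rather than something shown in these examples.

        // Sketch only: assumes this port exposes a java.io-style StringWriter and a
        // PrintTree(Tree, PrintWriter) overload mirroring TreePrint.printTree in the Java API.
        public static string CaptureTreePrint(Tree parse)
        {
            StringWriter sw = new StringWriter();
            PrintWriter  pw = new PrintWriter(sw);
            TreePrint    tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.PrintTree(parse, pw);   // output goes to pw instead of the console
            pw.Close();                // closing the writer flushes the buffered output
            return sw.ToString();      // the Penn tree plus collapsed dependencies as text
        }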
Example #6
        static void Main(string[] args)
        {
            // Loading POS Tagger
            var tagger = new MaxentTagger(@"Resources/english-bidirectional-distsim.tagger");

            // Text for tagging
            //var text = @"یک روز آمدم ";
            var text = "hello how are you?";
            IList <Tuple <string, string> > tagged = new List <Tuple <string, string> >();


            var sentences = MaxentTagger.tokenizeText(new StringReader(text)).toArray();

            foreach (ArrayList sentence in sentences)
            {
                var taggedSentence = tagger.tagSentence(sentence);
                System.Console.WriteLine(SentenceUtils.listToString(taggedSentence, false));

                for (int i = 0; i < taggedSentence.size(); i++)
                {
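                    // Each tagged token renders as "word/TAG"; splitting on '/' separates them (this breaks if the word itself contains a slash)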
                    var t = taggedSentence.toArray()[i].ToString().Split('/');
                    tagged.Add(Tuple.Create(t[0], t[1]));
                }
            }
        }
Example #7
        static void Main()
        {
            var jarRoot         = @"C:\Users\Burds\Downloads\Stanford.NLP.NET-master (1)\Stanford.NLP.NET-master\samples\Stanford.NLP.POSTagger.CSharp\bin\Debug\stanford-postagger-2018-02-27";
            var modelsDirectory = jarRoot + @"\models";

            // Loading POS Tagger
            var tagger = new MaxentTagger(modelsDirectory + @"\english-left3words-distsim.tagger");

            // Text for tagging
            var text = "This is a test sentence.";

            var sentences = MaxentTagger.tokenizeText(new StringReader(text)).toArray();

            string[] getType = new string[10];
            foreach (ArrayList sentence in sentences)
            {
                var taggedSentence = tagger.tagSentence(sentence);
                Console.WriteLine(SentenceUtils.listToString(taggedSentence, false));
                var data = new List <DataClass>();

                for (int i = 0; i < taggedSentence.size() - 1; i++)
                {
                    string myString = taggedSentence.get(i).ToString();
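                    // taggedSentence.get(i) renders as "word/TAG"; the substring after the '/' is the POS tag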

                    data.Add(new DataClass
                    {
                        SWord = sentence.get(i).ToString(),
                        WType = myString.Substring(myString.IndexOf("/") + 1)
                    });
                    //getType[i] = myString.Substring(myString.IndexOf("/") + 1);
                }
            }
        }
Example #8
        public void ParseEasySentence()
        {
            // This option shows parsing a list of correctly tokenized words
            var sent     = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = SentenceUtils.toCoreLabelList(sent);
            var parse    = _lp.apply(rawWords);

            Assert.NotNull(parse);
            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2            = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            using var sent2Reader = new StringReader(sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            parse = _lp.apply(rawWords2);
            Assert.NotNull(parse);

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();

            TestContext.Out.WriteLine($"\n{tdl}\n");

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            Assert.NotNull(tp);
            tp.printTree(parse);
        }
        /// <summary>Make a new Annotation from a List of tokenized sentences.</summary>
        public Annotation(IList <ICoreMap> sentences)
            : base()
        {
            this.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
            IList <CoreLabel> tokens = new List <CoreLabel>();
            StringBuilder     text   = new StringBuilder();

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                Sharpen.Collections.AddAll(tokens, sentenceTokens);
                if (sentence.ContainsKey(typeof(CoreAnnotations.TextAnnotation)))
                {
                    text.Append(sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
                }
                else
                {
                    // If there is no text in the sentence, fake it as best as we can
                    if (text.Length > 0)
                    {
                        text.Append('\n');
                    }
                    text.Append(SentenceUtils.ListToString(sentenceTokens));
                }
            }
            this.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            this.Set(typeof(CoreAnnotations.TextAnnotation), text.ToString());
        }
        /// <summary>Returns the string associated with the input parse tree.</summary>
        /// <remarks>
        /// Returns the string associated with the input parse tree. Traces and
        /// ATB-specific escape sequences (e.g., "-RRB-" for ")") are removed.
        /// </remarks>
        /// <param name="t">- A parse tree</param>
        /// <returns>The yield of the input parse tree</returns>
        public static string FlattenTree(Tree t)
        {
            t = t.Prune(emptyFilter, tf);
            string flatString = SentenceUtils.ListToString(t.Yield());

            return(flatString);
        }
        public virtual void Build()
        {
            LineNumberReader infile        = null;
            PrintWriter      outfile       = null;
            string           currentInfile = string.Empty;

            try
            {
                outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName), "UTF-8")));
                foreach (File path in pathsToData)
                {
                    infile        = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8")));
                    currentInfile = path.GetPath();
                    while (infile.Ready())
                    {
                        List <Word> sent = SentenceUtils.ToUntaggedList(infile.ReadLine().Split("\\s+"));
                        foreach (Word token in sent)
                        {
                            Matcher hasArabic = utf8ArabicChart.Matcher(token.Word());
                            if (hasArabic.Find())
                            {
                                token.SetWord(escaper.Apply(token.Word()));
                                token.SetWord(lexMapper.Map(null, token.Word()));
                            }
                        }
                        outfile.Println(SentenceUtils.ListToString(sent));
                    }
                    toStringBuffer.Append(string.Format(" Read %d input lines from %s", infile.GetLineNumber(), path.GetPath()));
                }
                infile.Close();
            }
            catch (UnsupportedEncodingException e)
            {
                System.Console.Error.Printf("%s: Filesystem does not support UTF-8 output\n", this.GetType().FullName);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException)
            {
                System.Console.Error.Printf("%s: Could not open %s for writing\n", this.GetType().FullName, outFileName);
            }
            catch (IOException)
            {
                System.Console.Error.Printf("%s: Error reading from %s (line %d)\n", this.GetType().FullName, currentInfile, infile.GetLineNumber());
            }
            catch (Exception e)
            {
                System.Console.Error.Printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", this.GetType().FullName, currentInfile, infile.GetLineNumber());
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                if (outfile != null)
                {
                    outfile.Close();
                }
            }
        }
Example #12
        private void TagReader(Reader reader)
        {
            var sentences = MaxentTagger.tokenizeText(reader).toArray();

            Assert.NotNull(sentences);

            foreach (ArrayList sentence in sentences)
            {
                var tSentence = _tagger.tagSentence(sentence);
                TestContext.Out.WriteLine(SentenceUtils.listToString(tSentence, false));
            }
        }
Example #13
        /// <summary>Reads an annotation from the given filename using the requested input.</summary>
        public static IList <Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
        {
            switch (inputFormat)
            {
            case SentimentPipeline.Input.Text:
            {
                string     text       = IOUtils.SlurpFileNoExceptions(filename);
                Annotation annotation = new Annotation(text);
                tokenizer.Annotate(annotation);
                IList <Annotation> annotations = Generics.NewArrayList();
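                // Wrap each detected sentence in its own single-sentence Annotation so it can be processed independently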
                foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    Annotation nextAnnotation = new Annotation(sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
                    nextAnnotation.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
                    annotations.Add(nextAnnotation);
                }
                return(annotations);
            }

            case SentimentPipeline.Input.Trees:
            {
                IList <Tree> trees;
                if (filterUnknown)
                {
                    trees = SentimentUtils.ReadTreesWithGoldLabels(filename);
                    trees = SentimentUtils.FilterUnknownRoots(trees);
                }
                else
                {
                    MemoryTreebank treebank = new MemoryTreebank("utf-8");
                    treebank.LoadPath(filename, null);
                    trees = new List <Tree>(treebank);
                }
                IList <Annotation> annotations = Generics.NewArrayList();
                foreach (Tree tree in trees)
                {
                    ICoreMap sentence = new Annotation(SentenceUtils.ListToString(tree.Yield()));
                    sentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);
                    IList <ICoreMap> sentences  = Java.Util.Collections.SingletonList(sentence);
                    Annotation       annotation = new Annotation(string.Empty);
                    annotation.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
                    annotations.Add(annotation);
                }
                return(annotations);
            }

            default:
            {
                throw new ArgumentException("Unknown format " + inputFormat);
            }
            }
        }
Example #14
        /// <summary>Tags the sentence s by running maxent model.</summary>
        /// <remarks>
        /// Tags the sentence s by running maxent model.  Returns a sentence (List) of
        /// TaggedWord objects.
        /// </remarks>
        /// <param name="s">Input sentence (List).  This isn't changed.</param>
        /// <returns>Tagged sentence</returns>
        public virtual List <TaggedWord> TagSentence <_T0>(IList <_T0> s, bool reuseTags)
            where _T0 : IHasWord
        {
            this.origWords = new List <IHasWord>(s);
            int sz = s.Count;

            this.sent = new List <string>(sz + 1);
            foreach (IHasWord value1 in s)
            {
                if (maxentTagger.wordFunction != null)
                {
                    sent.Add(maxentTagger.wordFunction.Apply(value1.Word()));
                }
                else
                {
                    sent.Add(value1.Word());
                }
            }
            sent.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
            if (reuseTags)
            {
                this.originalTags = new List <string>(sz + 1);
                foreach (IHasWord value in s)
                {
                    if (value is IHasTag)
                    {
                        originalTags.Add(((IHasTag)value).Tag());
                    }
                    else
                    {
                        originalTags.Add(null);
                    }
                }
                originalTags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
            }
            size = sz + 1;
            if (Verbose)
            {
                log.Info("Sentence is " + SentenceUtils.ListToString(sent, false, tagSeparator));
            }
            Init();
            List <TaggedWord> result = TestTagInference();

            if (maxentTagger.wordFunction != null)
            {
                for (int j = 0; j < sz; ++j)
                {
                    result[j].SetWord(s[j].Word());
                }
            }
            return(result);
        }
        /// <summary>Converts a parse tree into a string of tokens.</summary>
        /// <remarks>
        /// Converts a parse tree into a string of tokens. Each token is a word and
        /// its POS tag separated by the delimiter specified by <code>separator</code>
        /// </remarks>
        /// <param name="t">- A parse tree</param>
        /// <param name="removeEscaping">- If true, remove LDC escape characters. Otherwise, leave them.</param>
        /// <param name="separator">Word/tag separator</param>
        /// <returns>A string of tagged words</returns>
        public static string TaggedStringFromTree(Tree t, bool removeEscaping, string separator)
        {
            t = t.Prune(emptyFilter, tf);
            IList <CoreLabel> taggedSentence = t.TaggedLabeledYield();

            foreach (CoreLabel token in taggedSentence)
            {
                string word = (removeEscaping) ? UnEscape(token.Word()) : token.Word();
                token.SetWord(word);
                token.SetValue(word);
            }
            return(SentenceUtils.ListToString(taggedSentence, false, separator));
        }
Example #16
        public virtual void TestSimpleTrigger()
        {
            IList <TokenSequencePattern> patterns = new List <TokenSequencePattern>();

            patterns.Add(TokenSequencePattern.Compile("which word should be matched"));
            MultiPatternMatcher.ISequencePatternTrigger <ICoreMap> trigger = new MultiPatternMatcher.BasicSequencePatternTrigger <ICoreMap>(new CoreMapNodePatternTrigger(patterns));
            ICollection <SequencePattern <ICoreMap> > triggered            = trigger.Apply(SentenceUtils.ToCoreLabelList("one", "two", "three"));

            NUnit.Framework.Assert.AreEqual(0, triggered.Count);
            triggered = trigger.Apply(SentenceUtils.ToCoreLabelList("which"));
            NUnit.Framework.Assert.AreEqual(0, triggered.Count);
            triggered = trigger.Apply(SentenceUtils.ToCoreLabelList("which", "word", "should", "be", "matched"));
            NUnit.Framework.Assert.AreEqual(1, triggered.Count);
        }
Example #17
        public virtual void TestInitialStateFromTagged()
        {
            string[] words = new string[] { "This", "is", "a", "short", "test", "." };
            string[] tags  = new string[] { "DT", "VBZ", "DT", "JJ", "NN", "." };
            NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
            IList <TaggedWord> sentence = SentenceUtils.ToTaggedList(Arrays.AsList(words), Arrays.AsList(tags));
            State state = ShiftReduceParser.InitialStateFromTaggedSentence(sentence);
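            // Each entry in state.sentence is a preterminal tree: the POS tag as the node value with the word as its single leaf child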

            for (int i = 0; i < words.Length; ++i)
            {
                NUnit.Framework.Assert.AreEqual(tags[i], state.sentence[i].Value());
                NUnit.Framework.Assert.AreEqual(1, state.sentence[i].Children().Length);
                NUnit.Framework.Assert.AreEqual(words[i], state.sentence[i].Children()[0].Value());
            }
        }
Example #18
        public virtual void TestTransition()
        {
            string[] words = new string[] { "This", "is", "a", "short", "test", "." };
            string[] tags  = new string[] { "DT", "VBZ", "DT", "JJ", "NN", "." };
            NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
            IList <TaggedWord> sentence = SentenceUtils.ToTaggedList(Arrays.AsList(words), Arrays.AsList(tags));
            State           state       = ShiftReduceParser.InitialStateFromTaggedSentence(sentence);
            ShiftTransition shift       = new ShiftTransition();

            for (int i = 0; i < 3; ++i)
            {
                state = shift.Apply(state);
            }
            NUnit.Framework.Assert.AreEqual(3, state.tokenPosition);
        }
Example #19
        public virtual void TestFromList()
        {
            IList <ICoreMap>  sentences = Generics.NewArrayList();
            ICoreMap          sentence  = new ArrayCoreMap();
            IList <CoreLabel> words     = SentenceUtils.ToCoreLabelList("This", "is", "a", "test", ".");

            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
            sentences.Add(sentence);
            Annotation annotation = new Annotation(sentences);

            NUnit.Framework.Assert.AreEqual("This is a test .", annotation.ToString());
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), "This is a test.");
            annotation = new Annotation(sentences);
            NUnit.Framework.Assert.AreEqual("This is a test.", annotation.ToString());
        }
        private Tree PostProcessMWE(Tree t)
        {
            string tYield = SentenceUtils.ListToString(t.Yield()).ReplaceAll("\\s+", string.Empty);

            if (tYield.Matches("[\\d\\p{Punct}]*"))
            {
                IList <Tree> kids = new List <Tree>();
                kids.Add(treeFactory.NewLeaf(tYield));
                t = treeFactory.NewTreeNode(t.Value(), kids);
            }
            else
            {
                t.SetValue(MwePhrasal + t.Value());
            }
            return(t);
        }
        private static void CompareXMLResults(string input, string element, params string[] expectedResults)
        {
            List <string>        results  = new List <string>();
            DocumentPreprocessor document = new DocumentPreprocessor(new BufferedReader(new StringReader(input)), DocumentPreprocessor.DocType.Xml);

            document.SetElementDelimiter(element);
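            // Only text inside the given XML element is treated as sentence content; everything outside it is ignored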
            foreach (IList <IHasWord> sentence in document)
            {
                results.Add(SentenceUtils.ListToString(sentence));
            }
            NUnit.Framework.Assert.AreEqual(expectedResults.Length, results.Count);
            for (int i = 0; i < results.Count; ++i)
            {
                NUnit.Framework.Assert.AreEqual(expectedResults[i], results[i]);
            }
        }
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                log.Info("usage: java TaggerDemo modelFile fileToTag");
                return;
            }
            MaxentTagger tagger = new MaxentTagger(args[0]);
            IList <IList <IHasWord> > sentences = MaxentTagger.TokenizeText(new BufferedReader(new FileReader(args[1])));

            foreach (IList <IHasWord> sentence in sentences)
            {
                IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                System.Console.Out.WriteLine(SentenceUtils.ListToString(tSentence, false));
            }
        }
        // static methods
        public static void SetLabels(Tree tree, IDictionary <string, string> labelMap, ParseAndSetLabels.MissingLabels missing, string defaultLabel, ICollection <string> unknowns)
        {
            if (tree.IsLeaf())
            {
                return;
            }
            string text  = SentenceUtils.ListToString(tree.Yield());
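            // The label lookup key is the subtree's yield flattened to a space-separated string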
            labelMap.TryGetValue(text, out string label);  // a missing key leaves label null (mirrors Java's Map.get) instead of throwing

            if (label != null)
            {
                tree.Label().SetValue(label);
            }
            else
            {
                switch (missing)
                {
                case ParseAndSetLabels.MissingLabels.Fail:
                {
                    throw new Exception("No label for '" + text + "'");
                }

                case ParseAndSetLabels.MissingLabels.Default:
                {
                    tree.Label().SetValue(defaultLabel);
                    unknowns.Add(text);
                    break;
                }

                case ParseAndSetLabels.MissingLabels.KeepOriginal:
                {
                    // do nothing
                    break;
                }

                default:
                {
                    throw new ArgumentException("Unknown MissingLabels mode " + missing);
                }
                }
            }
            foreach (Tree child in tree.Children())
            {
                SetLabels(child, labelMap, missing, defaultLabel, unknowns);
            }
        }
        public virtual void TestBinarySide()
        {
            string[] words = new string[] { "This", "is", "a", "short", "test", "." };
            string[] tags  = new string[] { "DT", "VBZ", "DT", "JJ", "NN", "." };
            NUnit.Framework.Assert.AreEqual(words.Length, tags.Length);
            IList <TaggedWord> sentence = SentenceUtils.ToTaggedList(Arrays.AsList(words), Arrays.AsList(tags));
            State           state       = ShiftReduceParser.InitialStateFromTaggedSentence(sentence);
            ShiftTransition shift       = new ShiftTransition();

            state = shift.Apply(shift.Apply(state));
            BinaryTransition transition = new BinaryTransition("NP", BinaryTransition.Side.Right);
            State            next       = transition.Apply(state);

            NUnit.Framework.Assert.AreEqual(BinaryTransition.Side.Right, ShiftReduceUtils.GetBinarySide(next.stack.Peek()));
            transition = new BinaryTransition("NP", BinaryTransition.Side.Left);
            next       = transition.Apply(state);
            NUnit.Framework.Assert.AreEqual(BinaryTransition.Side.Left, ShiftReduceUtils.GetBinarySide(next.stack.Peek()));
        }
        public virtual void TestArabicTokenizer()
        {
            System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            for (int i = 0; i < untokInputs.Length; ++i)
            {
                string line = untokInputs[i];
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
                IList <CoreLabel>      tokens    = tokenizer.Tokenize();
                string tokenizedLine             = SentenceUtils.ListToString(tokens);
                string reference = tokReferences[i];
                NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", reference, tokenizedLine);
            }
        }
Example #26
        public virtual void TestSpanishDatelineSeparation()
        {
            Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, cleanxml, ssplit", "tokenize.language", "es", "tokenize.options", "tokenizeNLs,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two", "ssplit.boundaryMultiTokenRegex"
                                                            , "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/  /[-\\p{L}]+/+ )? " + "( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

            NUnit.Framework.Assert.AreEqual(dateLineSpanishTexts.Length, dateLineSpanishTokens.Length, "Bad test data");
            for (int i = 0; i < dateLineSpanishTexts.Length; i++)
            {
                Annotation document1 = new Annotation(dateLineSpanishTexts[i]);
                pipeline.Annotate(document1);
                IList <ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
                NUnit.Framework.Assert.AreEqual(2, sentences.Count, "For " + dateLineSpanishTexts[i] + " annotation is " + document1);
                IList <CoreLabel> sentenceOneTokens = sentences[0].Get(typeof(CoreAnnotations.TokensAnnotation));
                string            sentenceOne       = SentenceUtils.ListToString(sentenceOneTokens);
                NUnit.Framework.Assert.AreEqual(dateLineSpanishTokens[i], sentenceOne, "Bad tokens in dateline");
            }
        }
Example #27
 public override void Evaluate(Tree guess, Tree gold, PrintWriter pw)
 {
     if (gold == null || guess == null)
     {
         System.Console.Error.Printf("%s: Cannot compare against a null gold or guess tree!\n", this.GetType().FullName);
         return;
     }
     else
     {
         if (guess.Yield().Count != gold.Yield().Count)
         {
             log.Info("Warning: yield differs:");
             log.Info("Guess: " + SentenceUtils.ListToString(guess.Yield()));
             log.Info("Gold:  " + SentenceUtils.ListToString(gold.Yield()));
         }
     }
     base.Evaluate(guess, gold, pw);
 }
Example #28
        public List <string> FindTag(string text, List <string> tag)
        {
            List <string> adj = new List <string>();
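            // Collect every word whose POS tag appears in the requested tag list (e.g. "JJ" for adjectives)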

            object[] sentences = MaxentTagger.tokenizeText(new java.io.StringReader(text)).toArray();
            foreach (ArrayList sentence in sentences)
            {
                List taggedSentence = this.tagger.tagSentence(sentence);
                foreach (CoreLabel word in SentenceUtils.toCoreLabelList(taggedSentence).toArray())
                {
                    if (tag.Contains(word.tag()))
                    {
                        adj.Add(word.word());
                    }
                }
            }
            return(adj);
        }
Example #29
        public string send(string text)
        {
            string[] exampleWords = text.Split(
                new char[] { ' ', ',', '.', ')', '(' }, StringSplitOptions.RemoveEmptyEntries);

            ILemmatizer lmtz = new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English);
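            // LemmaSharp's prebuilt English lemmatizer; each token is reduced to its lemma before POS tagging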

            StringBuilder sb = new StringBuilder();

            foreach (string word in exampleWords)
            {
                sb.Append(LemmatizeOne(lmtz, word) + " ");
            }

            string finalstring = sb.ToString();


            var jarRoot         = @"E:\stanford-postagger-full-2015-12-09\stanford-postagger-full-2015-12-09";
            var modelsDirectory = jarRoot + @"\models";
            // Loading POS Tagger
            var tagger = new MaxentTagger(modelsDirectory + @"\wsj-0-18-bidirectional-nodistsim.tagger");

            // Text for tagging
            StringBuilder str = new StringBuilder();

            var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(finalstring)).toArray();

            foreach (ArrayList sentence in sentences)
            {
                var    taggedSentence = tagger.tagSentence(sentence);
                string sent           = SentenceUtils.listToString(taggedSentence, false);

                String[] tokens = sent.Split(' ');
                for (int i = 0; i < tokens.Length; i++)
                {
                    if (tokens[i].Contains("/VB"))
                    {
                        str.Append(tokens[i] + " ");
                    }
                }
            }
            return(str.ToString());
        }
Example #30
        public virtual void TestTwoNewlineIsSentenceBreakTokenizeNLs()
        {
            string          text      = "This is \none sentence\n\nThis is not another.";
            Properties      props     = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "tokenize.language", "en", "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true", "ssplit.newlineIsSentenceBreak", "two");
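            // With ssplit.newlineIsSentenceBreak set to "two", only a blank line (two consecutive newlines) forces a sentence boundary, so "This is \none sentence" stays one sentence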
            StanfordCoreNLP pipeline  = new StanfordCoreNLP(props);
            Annotation      document1 = new Annotation(text);

            pipeline.Annotate(document1);
            IList <ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));

            NUnit.Framework.Assert.AreEqual(2, sentences.Count);
            // make sure that there are the correct # of tokens (does contain NL tokens)
            IList <CoreLabel> tokens = document1.Get(typeof(CoreAnnotations.TokensAnnotation));

            NUnit.Framework.Assert.AreEqual(9, tokens.Count);
            IList <CoreLabel> sentenceTwoTokens = sentences[1].Get(typeof(CoreAnnotations.TokensAnnotation));
            string            sentenceTwo       = SentenceUtils.ListToString(sentenceTwoTokens);

            NUnit.Framework.Assert.AreEqual("This is not another .", sentenceTwo, "Bad tokens in sentence");
        }