Example #1
        /// <summary>
        /// Gets the lemma (stemmed form) of the first token in the given text
        /// </summary>
        /// <param name="text">Text to stem</param>
        /// <returns>The lemma of the first token, or null if annotation fails</returns>
        public string GetStemmedText(string text)
        {
            try
            {
                // Annotation
                var annotation = new Annotation(text);
                _pipeLine.annotate(annotation);

                // Sentence
                ArrayList sentences = annotation.get(_sentencesAnnotation.getClass()) as ArrayList;
                CoreMap   sentence  = sentences.get(0) as CoreMap;

                // Token
                ArrayList tokens = sentence.get(_tokensAnnotation.getClass()) as ArrayList;
                CoreLabel token  = tokens.get(0) as CoreLabel;

                // Lemma
                string lemma = token.get(_lemmaAnnotation.getClass()).ToString();

                return(lemma);
            }
            catch (Exception)
            {
                return(null);
            }
        }
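Example #1 relies on fields the snippet does not show (_pipeLine and the annotation-key instances). A minimal setup sketch, assuming the Stanford CoreNLP .NET (IKVM) bindings; the Stemmer class name and the modelDir parameter are illustrative, not taken from the original code:

using System;
using java.util;
using edu.stanford.nlp.ling;
using edu.stanford.nlp.pipeline;

public class Stemmer
{
    // Sketch only: the field names mirror the example above, everything else is assumed.
    private readonly StanfordCoreNLP _pipeLine;
    private readonly CoreAnnotations.SentencesAnnotation _sentencesAnnotation = new CoreAnnotations.SentencesAnnotation();
    private readonly CoreAnnotations.TokensAnnotation    _tokensAnnotation    = new CoreAnnotations.TokensAnnotation();
    private readonly CoreAnnotations.LemmaAnnotation     _lemmaAnnotation     = new CoreAnnotations.LemmaAnnotation();

    public Stemmer(string modelDir)
    {
        // The lemma annotator needs tokenize, ssplit and pos to run before it.
        var props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma");

        // CoreNLP resolves its model files relative to the current directory,
        // so switch there while the pipeline loads.
        var previous = Environment.CurrentDirectory;
        Environment.CurrentDirectory = modelDir;
        _pipeLine = new StanfordCoreNLP(props);
        Environment.CurrentDirectory = previous;
    }

    // GetStemmedText(string text) as shown above
}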
Example #2
        /// <summary>Return a new <see cref="ParseResult"/> constructed from <paramref name="annotation"/></summary>
        internal ParseResult(Annotation annotation)
        {
            java.util.AbstractList sentences = annotation.get(SentencesAnnotationClass) as java.util.AbstractList;
            CoreMap sentence = sentences.get(0) as CoreMap;
            LabeledScoredTreeNode constituencyParse = sentence.get(TreeAnnotationClass) as LabeledScoredTreeNode;
            // Skip the ROOT
            Tree childOfRoot = constituencyParse.firstChild();

            Constituents = childOfRoot;
            Constituents.indexLeaves();

            // Build the collection of tokens
            var parsedTokens = sentence.get(TokensAnnotationClass) as java.util.AbstractList;
            var mentions     = sentence.get(MentionsAnnotationClass);

            for (int tokenIndex = 0; tokenIndex < parsedTokens.size(); tokenIndex++)
            {
                CoreLabel source        = parsedTokens.get(tokenIndex) as CoreLabel;
                var       tokenMentions = source.get(MentionTokenAnnotationClass);
                var       tokenGender   = source.get(GenderAnnotationClass);
                Tokens.Add(new ParseToken
                {
                    Index            = source.index(),
                    Word             = source.word(),
                    Lemma            = source.lemma(),
                    PartOfSpeech     = source.get(PartOfSpeechAnnotationClass) as string,
                    NamedEntityClass = source.get(NamedEntityTagAnnotationClass) as string,
                });
            }

            // Create the list of dependencies between tokens
            SemanticGraph dependencyGraph = sentence.get(DependencyAnnotationClass) as SemanticGraph;

            //java.util.List dependencies = dependencyGraph.edgeListSorted();
            java.util.Iterator dependencyGraphEdges = dependencyGraph.edgeIterable().iterator();
            while (dependencyGraphEdges.hasNext())
            {
                SemanticGraphEdge edge = dependencyGraphEdges.next() as SemanticGraphEdge;

                string      relationName      = edge.getRelation().getShortName();
                string      relationSpecifier = edge.getRelation().getSpecific();
                IndexedWord governor          = edge.getGovernor();
                IndexedWord dependent         = edge.getDependent();

                Dependencies.Add((relationName, relationSpecifier, governor.index(), dependent.index()));
            }
        }
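The constructor above reads sentence-level constituency, dependency, token, mention, and gender annotations, so the Annotation it receives must come from a pipeline that ran the corresponding annotators. A rough sketch of such a call; the annotator list and sentence text are assumptions, and the exact set required can vary by CoreNLP version:

// Build an Annotation carrying the keys ParseResult reads.
var props = new java.util.Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, gender, parse, depparse");

var pipeline   = new edu.stanford.nlp.pipeline.StanfordCoreNLP(props);
var annotation = new edu.stanford.nlp.pipeline.Annotation("The quick brown fox jumped over the lazy dog.");
pipeline.annotate(annotation);

// ParseResult is internal, so this call has to live in the same assembly.
ParseResult result = new ParseResult(annotation);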
Example #3
        // Tag the recipe text with parts of speech, collecting adjectives and nouns
        // and treating everything after the first comma as a note.
        public void Process(ref RecipeItem rec)
        {
            string text = rec.Value.ToLower();
            // Annotation
            var annotation = new Annotation(text);

            pipeline.annotate(annotation);

            // these are all the sentences in this document
            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
            var sentences = annotation.get(typeof(CoreAnnotations.SentencesAnnotation));

            if (sentences == null)
            {
                return;
            }


            var adj  = "";
            var noun = "";

            foreach (CoreMap sentence in sentences as ArrayList)
            {
                //var token = sentence.get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                var       token = sentence.get(typeof(CoreAnnotations.TokensAnnotation));
                CoreLabel prev   = new CoreLabel();
                bool      isNote = false;
                foreach (CoreLabel typ in token as ArrayList)
                {
                    object word = typ.get(typeof(CoreAnnotations.TextAnnotation));
                    var    pos  = typ.get(typeof(CoreAnnotations.PartOfSpeechAnnotation));

                    Console.WriteLine("type: {0}, word: {1}", pos, word);
                    string test = pos.ToString().ToLower();
                    if (isNote)
                    {
                        rec.Notes += " " + word;
                    }

                    if (test.Contains(","))
                    {
                        isNote = true;
                    }

                    if (test.Contains("jj"))
                    {
                        adj += " " + word;
                    }

                    if (test.Contains("nn"))
                    {
                        noun += " " + word;
                    }

                    if (prev.value() != null)
                    {
                        word = prev.get(typeof(CoreAnnotations.TextAnnotation));
                        pos  = prev.get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                    }

                    prev = typ;
                }
            }
            Console.WriteLine("\n");
            rec.Adj  = adj;
            rec.Noun = noun;
        }
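Process mutates the RecipeItem passed by reference: words tagged JJ are collected into Adj, words tagged NN into Noun, and everything after the first comma is appended to Notes. A hypothetical shape for RecipeItem, inferred only from the members the method touches; the real type may differ:

public class RecipeItem
{
    public string Value { get; set; } // raw ingredient text, e.g. "2 ripe tomatoes, diced"
    public string Notes { get; set; } // words after the first comma ("diced")
    public string Adj   { get; set; } // adjectives found by the tagger ("ripe")
    public string Noun  { get; set; } // nouns found by the tagger ("tomatoes")
}

// Hypothetical usage, where "parser" is whatever object owns the pipeline field:
// var item = new RecipeItem { Value = "2 ripe tomatoes, diced" };
// parser.Process(ref item);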
Example #4
        private static int tokReader(TextReader r, TextWriter writer, Regex parseInsidePattern, String options, bool preserveLines, bool dump, bool lowerCase) /*throws IOException*/
        {
            int     numTokens = 0;
            bool    beginLine = true;
            bool    printing  = (parseInsidePattern == null); // start off printing, unless you're looking for a start entity
            for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext();)
            {
                CoreLabel obj = tokenizer.next();
                // String origStr = obj.get(CoreAnnotations.TextAnnotation.class).replaceFirst("\n+$", ""); // DanC added this to fix a lexer bug, hopefully now corrected
                String origStr = (String)obj.get(typeof(CoreAnnotations.TextAnnotation));
                String str;
                if (lowerCase)
                {
                    str = origStr.ToLowerInvariant(); // lower-case in a culture-invariant way
                    obj.set(typeof(CoreAnnotations.TextAnnotation), str);
                }
                else
                {
                    str = origStr;
                }
                // Is this token a start/end tag delimiting the region we should print?
                // The pattern is assumed to be anchored, since Java's matches() covered the whole token.
                Match tagMatch = parseInsidePattern?.Match(origStr);
                if (tagMatch != null && tagMatch.Success)
                {
                    printing = tagMatch.Groups[1].Value.Length == 0; // turn printing on if there is no end-element slash, off if there is
                }
                else if (printing)
                {
                    if (dump)
                    {
                        // after having checked for tags, change str to be exhaustive
                        str = obj.ToString();
                    }
                    if (preserveLines)
                    {
                        if (PTBLexer.NEWLINE_TOKEN.Equals(origStr))
                        {
                            beginLine = true;
                            writer.WriteLine();
                        }
                        else
                        {
                            if (!beginLine)
                            {
                                writer.Write(' ');
                            }
                            else
                            {
                                beginLine = false;
                            }
                            // writer.Write(str.replace("\n", ""));
                            writer.Write(str);
                        }
                    }
                    else
                    {
                        writer.Write(str);
                        writer.WriteLine();
                    }
                }
                numTokens++;
            }
            return(numTokens);
        }
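A hedged usage sketch for tokReader: the sample text is made up, passing null for parseInsidePattern prints every token, and Console.Out stands in for the output writer. Because the method is private static, the call would sit in the same class; System and System.IO are assumed to be imported:

using (TextReader reader = new StringReader("Dr. Smith didn't pay the $2.50 fee."))
{
    // With preserveLines false, each token is written on its own line.
    int n = tokReader(reader, Console.Out, null, "", preserveLines: false, dump: false, lowerCase: false);
    Console.Error.WriteLine(n + " tokens");
}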