/// <summary>
/// Converts a (head word, grammatical relation) pair into a dependency pattern whose
/// trigger token is restricted to the head word's processed text.
/// </summary>
/// <param name="p">The head word (1-indexed into the sentence) and the relation to encode.</param>
/// <param name="sent">The sentence the head word belongs to.</param>
/// <returns>A new <c>DepPattern</c> over the head token and the given relation.</returns>
public static DepPattern PatternToDepPattern(Pair <IndexedWord, GrammaticalRelation> p, DataInstance sent)
{
    // IndexedWord indices are 1-based; the token list is 0-based.
    CoreLabel sourceLabel = sent.GetTokens()[p.First().Index() - 1];
    System.Diagnostics.Debug.Assert(sourceLabel.ContainsKey(typeof(PatternsAnnotations.ProcessedTextAnnotation)), "the keyset are " + sourceLabel.ToString(CoreLabel.OutputFormat.All));
    // Build a dep-type pattern token restricted to the head word's processed text.
    Token headToken = new Token(PatternFactory.PatternType.Dep);
    headToken.AddORRestriction(typeof(PatternsAnnotations.ProcessedTextAnnotation), sourceLabel.Get(typeof(PatternsAnnotations.ProcessedTextAnnotation)));
    return new DepPattern(headToken, p.Second());
}
/// <summary>
/// Builds a generalized surface token for a context word: ORs in the token's value for
/// every generalization class that is not the background symbol, and (optionally) its
/// NER tag. Also reports whether the token carried only background labels.
/// </summary>
/// <param name="tokenj">The context token to generalize.</param>
/// <returns>
/// (background-only flag, the generalized token, "|"-joined string of the matched class keys).
/// </returns>
internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj)
{
    Token genericToken = new Token(PatternFactory.PatternType.Surface);
    string labelStr = string.Empty;
    bool backgroundOnly = true;
    foreach (KeyValuePair <string, Type> gen in ConstantsAndVariables.GetGeneralizeClasses())
    {
        // Every generalization class is expected to be set on every token.
        if (!tokenj.ContainsKey(gen.Value) || tokenj.Get(gen.Value) == null)
        {
            throw new Exception(" Why does the token not have the class " + gen.Value + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
        }
        if (tokenj.Get(gen.Value).Equals(ConstantsAndVariables.backgroundSymbol))
        {
            continue;
        }
        backgroundOnly = false;
        labelStr = labelStr.Length == 0 ? gen.Key : labelStr + "|" + gen.Key;
        genericToken.AddORRestriction(gen.Value, gen.Key);
    }
    // Optionally also generalize over the NER tag when it is non-background.
    if (useContextNERRestriction)
    {
        string nerTag = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        if (nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol))
        {
            backgroundOnly = false;
            labelStr = labelStr.Length == 0 ? nerTag : labelStr + "|" + nerTag;
            genericToken.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
        }
    }
    return new Triple <bool, Token, string>(backgroundOnly, genericToken, labelStr);
}
/// <summary>
/// Create a new search problem instance, given a sentence (possibly fragment), and the
/// corresponding parse tree.
/// </summary>
/// <param name="parseTree">The original tree of the sentence we are beginning with</param>
/// <param name="truthOfPremise">The truth of the premise. In most applications, this will just be true.</param>
/// <returns>A new search problem instance.</returns>
/// <exception cref="System.ArgumentException">If any vertex lacks a polarity annotation.</exception>
public virtual ForwardEntailerSearchProblem Apply(SemanticGraph parseTree, bool truthOfPremise)
{
    // Fail fast: every vertex with a backing label must carry a polarity annotation,
    // which is produced by NaturalLogicAnnotator.
    foreach (IndexedWord vertex in parseTree.VertexSet())
    {
        CoreLabel backing = vertex.BackingLabel();
        if (backing == null || backing.ContainsKey(typeof(NaturalLogicAnnotations.PolarityAnnotation)))
        {
            continue;
        }
        throw new ArgumentException("Cannot run Natural Logic forward entailment without polarity annotations set. See " + typeof(NaturalLogicAnnotator).GetSimpleName());
    }
    return new ForwardEntailerSearchProblem(parseTree, truthOfPremise, maxResults, maxTicks, weights);
}
/// <summary>Create a datum from a string.</summary>
/// <remarks>
/// Create a datum from a string. The CoreAnnotations must correspond to those used by
/// SequenceClassifier. The following annotations are copied from the provided
/// CoreLabel cl, if present:
/// DomainAnnotation
/// startOffset and endOffset will be added to the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// of
/// the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreLabel"/>
/// cl to give the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// and
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/>
/// of the resulting datum.
/// </remarks>
/// <param name="cl">Source label supplying the base character offset and (optionally) the domain.</param>
/// <param name="token">Text of the new datum.</param>
/// <param name="label">Answer (and gold answer) label of the new datum.</param>
/// <param name="startOffset">Begin offset relative to cl's begin offset.</param>
/// <param name="endOffset">End offset relative to cl's begin offset.</param>
private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
{
    CoreLabel newTok = new CoreLabel();
    newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
    newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
    // Both offsets are computed from cl's BEGIN offset (see <remarks>): the end offset is
    // intentionally begin + endOffset, not end + endOffset.
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + startOffset);
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + endOffset);
    // Fix: the original guarded this copy with "cl != null", but cl was already dereferenced
    // unconditionally above, so that check was dead code and has been removed.
    if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
    {
        newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
    }
    return newTok;
}
/// <summary>
/// Returns the next token from the lexer (or the compound buffer when splitting is on),
/// skipping tokens that orthographic normalization reduced to the empty string, and
/// splitting compounds, verb+pronoun clusters, and contractions as configured.
/// </summary>
protected internal override T GetNext()
{
    try
    {
        T tok;
        // Drain zero-length tokens: normalization options can obliterate a token entirely.
        do
        {
            tok = (splitAny && !compoundBuffer.IsEmpty()) ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
        }
        while (tok != null && tok.Word().IsEmpty());
        // Split the token if the lexer flagged it with a parent annotation we handle.
        if (splitAny && tok is CoreLabel)
        {
            CoreLabel cl = (CoreLabel)tok;
            if (cl.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
            {
                if (splitCompounds && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.CompoundAnnotation))
                {
                    tok = (T)ProcessCompound(cl);
                }
                else if (splitVerbs && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.VbPronAnnotation))
                {
                    tok = (T)ProcessVerb(cl);
                }
                else if (splitContractions && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.ContrAnnotation))
                {
                    tok = (T)ProcessContraction(cl);
                }
            }
        }
        return tok;
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Recursively searches the tree for a node whose [BeginIndex, EndIndex) annotations
/// exactly equal the requested span, pruning subtrees that cannot contain it.
/// </summary>
/// <param name="tree">Root of the (sub)tree to search.</param>
/// <param name="start">Requested begin index.</param>
/// <param name="end">Requested end index.</param>
/// <returns>The matching node, or null if no node has exactly this span.</returns>
private static Tree FindTreeWithSpan(Tree tree, int start, int end)
{
    CoreLabel label = (CoreLabel)tree.Label();
    if (label != null && label.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) && label.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation)))
    {
        int spanStart = label.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int spanEnd = label.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        if (start == spanStart && end == spanEnd)
        {
            // Exact match.
            return tree;
        }
        if (end < spanStart || start >= spanEnd)
        {
            // Requested span lies entirely outside this subtree; prune.
            return null;
        }
    }
    // Partial overlap (or no span info here): a matching node may still be in a child.
    foreach (Tree child in tree.Children())
    {
        if (child == null)
        {
            continue;
        }
        Tree found = FindTreeWithSpan(child, start, end);
        if (found != null)
        {
            return found;
        }
    }
    return null;
}
/// <summary>
/// Returns the next token from the lexer (or the compound buffer when splitting is on),
/// skipping tokens that orthographic normalization reduced to zero length, then splitting
/// compounds and contractions as configured. The contraction check deliberately re-examines
/// the token produced by compound splitting.
/// </summary>
protected internal override T GetNext()
{
    try
    {
        T tok = null;
        // Drain zero-length tokens: normalization options can obliterate a token entirely.
        do
        {
            tok = ((splitContractions || splitCompounds) && compoundBuffer.Count > 0) ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
        }
        while (tok != null && tok.Word().Length == 0);
        // Compound splitting first...
        if (splitCompounds)
        {
            CoreLabel compoundCandidate = tok as CoreLabel;
            if (compoundCandidate != null && compoundCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) && compoundCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.CompoundAnnotation))
            {
                tok = (T)ProcessCompound(compoundCandidate);
            }
        }
        // ...then contraction splitting on the (possibly replaced) token.
        if (splitContractions)
        {
            CoreLabel contractionCandidate = tok as CoreLabel;
            if (contractionCandidate != null && contractionCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) && contractionCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.ContrAnnotation))
            {
                tok = (T)ProcessContraction(contractionCandidate);
            }
        }
        return tok;
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Applies each token-sequence (surface) pattern to every sentence in this worker's shard
/// and collects the candidate phrases captured by the "$term" group.
/// </summary>
/// <returns>
/// A triple of: (1) counts of each candidate phrase per pattern that extracted it,
/// (2) for each pattern, the (sentence id, start, inclusive end) token spans it matched, and
/// (3) the candidate phrases whose tokens were already fully labeled with this label.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
{
    try
    {
        ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>();
        TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
        CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
        foreach (string sentid in sentids)
        {
            IList <CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // Higher branch values make matching faster but use more memory.
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    // If the neighboring words are labeled with the same label, club them into the match
                    // by widening [s, e). NOTE(review): if every token before s (or after e) is labeled,
                    // the loop runs off the end without the break firing and the bound is NOT widened
                    // to the sentence edge — presumably intentional, but worth confirming.
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    // To make sure we discard phrases with stopwords in between, but include the ones in
                    // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
                    // addedindices[k] records whether token s+k contributed to the phrase.
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        // Mark the token as matched, and record which pattern matched it.
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // Reject the phrase if the token carries any of the "ignore" classes for this label.
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                // The token counts as "not labeled" unless it already carries this label.
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject phrases with an internal gap: a skipped token between two added tokens.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        // Span end is stored inclusive: e - 1.
                        matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                // Every contributing token was already labeled with this label.
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        // Log before rethrowing so worker-thread failures are not silently lost.
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Applies each dependency (Semgrex) pattern to every sentence in this worker's shard and
/// collects the phrases matched against the sentence's dependency graph.
/// </summary>
/// <returns>
/// A pair of: (1) counts of each candidate phrase per pattern that extracted it, and
/// (2) for each pattern, the (sentence id, start, inclusive end) token spans it matched.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
{
    TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
    CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
    foreach (string sentid in sentids)
    {
        DataInstance sent = sents[sentid];
        IList <CoreLabel> tokens = sent.GetTokens();
        foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
        {
            if (pEn.Key == null)
            {
                throw new Exception("why is the pattern " + pEn + " null?");
            }
            // Matching is done against the dependency graph, not the token sequence.
            SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
            ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
            foreach (ExtractedPhrase match in matched)
            {
                int s = match.startIndex;
                // e is exclusive; ExtractedPhrase.endIndex is inclusive.
                int e = match.endIndex + 1;
                string phrase = string.Empty;
                string phraseLemma = string.Empty;
                bool useWordNotLabeled = false;
                bool doNotUse = false;
                // If the neighboring words are labeled with the same label, club them into the match,
                // as long as the widened phrase stays within the compound-length limit for this label.
                if (constVars.clubNeighboringLabeledWords)
                {
                    for (int i = s - 1; i >= 0; i--)
                    {
                        if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            s = i;
                        }
                        else
                        {
                            break;
                        }
                    }
                    for (int i_1 = e; i_1 < tokens.Count; i_1++)
                    {
                        if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            e = i_1;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                // To make sure we discard phrases with stopwords in between, but include the ones in
                // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
                // addedindices[k] records whether token s+k contributed to the phrase.
                bool[] addedindices = new bool[e - s];
                // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
                for (int i_2 = s; i_2 < e; i_2++)
                {
                    CoreLabel l = tokens[i_2];
                    // Mark the token as matched, and record which pattern matched it.
                    l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                    if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                    {
                        l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                    }
                    Pattern pSur = pEn.Value;
                    System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                    System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                    l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                    // Reject the phrase if the token carries any of the "ignore" classes for this label.
                    foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                    {
                        if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                        {
                            doNotUse = true;
                        }
                    }
                    bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                    if (removePhrasesWithStopWords && containsStop)
                    {
                        doNotUse = true;
                    }
                    else
                    {
                        if (!containsStop || !removeStopWordsFromSelectedPhrases)
                        {
                            // The token counts as "not labeled" unless it already carries this label.
                            if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                useWordNotLabeled = true;
                            }
                            phrase += " " + l.Word();
                            phraseLemma += " " + l.Lemma();
                            addedindices[i_2 - s] = true;
                        }
                    }
                }
                // Reject phrases with an internal gap: a skipped token between two added tokens.
                for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                {
                    if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                    {
                        doNotUse = true;
                        break;
                    }
                }
                if (!doNotUse && useWordNotLabeled)
                {
                    // Span end is stored inclusive: e - 1.
                    matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                    if (useWordNotLabeled)
                    {
                        phrase = phrase.Trim();
                        phraseLemma = phraseLemma.Trim();
                        allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                    }
                }
            }
        }
    }
    return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
}
/// <summary>Take a dataset Annotation, generate their parse trees and identify syntactic heads (and head spans, if necessary)</summary>
/// <param name="dataset">The dataset to pre-process in place.</param>
public virtual void PreProcessSentences(Annotation dataset)
{
    logger.Severe("GenericDataSetReader: Started pre-processing the corpus...");
    // Run the processor (i.e., NER, parse etc.) unless the sentences already carry
    // syntactic annotation from offline files.
    if (processor != null)
    {
        IList <ICoreMap> sentences = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
        if (sentences.Count > 0 && !sentences[0].ContainsKey(typeof(TreeCoreAnnotations.TreeAnnotation)))
        {
            logger.Info("Annotating dataset with " + processor);
            processor.Annotate(dataset);
        }
        else
        {
            logger.Info("Found existing syntactic annotations. Will not use the NLP processor.");
        }
    }
    IList <ICoreMap> sentences_1 = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
    logger.Fine("Extracted " + sentences_1.Count + " sentences.");
    foreach (ICoreMap sentence in sentences_1)
    {
        IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        logger.Fine("Processing sentence " + tokens);
        Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
        if (tree == null)
        {
            throw new Exception("ERROR: MR requires full syntactic analysis!");
        }
        // Convert tree labels to CoreLabel if necessary. We need this because we store
        // additional info in the CoreLabel, such as the spans of each tree.
        ConvertToCoreLabels(tree);
        // Store the tree spans, if not present already.
        CoreLabel l = (CoreLabel)tree.Label();
        if (forceGenerationOfIndexSpans || (!l.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) && !l.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation))))
        {
            tree.IndexSpans(0);
            logger.Fine("Index spans were generated.");
        }
        else
        {
            logger.Fine("Index spans were NOT generated.");
        }
        logger.Fine("Parse tree using CoreLabel:\n" + tree.PennString());
        // Now match all entity mentions against the syntactic tree.
        if (sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)) != null)
        {
            foreach (EntityMention ent in sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)))
            {
                logger.Fine("Finding head for entity: " + ent);
                int headPos = AssignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
                logger.Fine("Syntactic head of mention \"" + ent + "\" is: " + tokens[headPos].Word());
                System.Diagnostics.Debug.Assert((ent.GetExtent() != null));
                System.Diagnostics.Debug.Assert((ent.GetHead() != null));
                System.Diagnostics.Debug.Assert((ent.GetSyntacticHeadTokenPosition() >= 0));
            }
        }
    }
    logger.Severe("GenericDataSetReader: Pre-processing complete.");
}
/// <summary>
/// Applies the multi-pattern matcher (all surface patterns at once) to every sentence in this
/// worker's shard and collects the (phrase, lemma) pairs captured by the "$term" group.
/// </summary>
/// <returns>
/// A pair of: (1) counts of each (phrase, lemma) pair per pattern that extracted it, and
/// (2) for each pattern, the (sentence id, start, end) spans it matched.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
{
    CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
    TwoDimensionalCounter <Pair <string, string>, E> allFreq = new TwoDimensionalCounter <Pair <string, string>, E>();
    foreach (string sentid in sentids)
    {
        IList <CoreLabel> sent = sents[sentid].GetTokens();
        // FIND_ALL is faster than FIND_NONOVERLAP.
        IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
        foreach (ISequenceMatchResult <ICoreMap> m in matched)
        {
            int s = m.Start("$term");
            int e = m.End("$term");
            // Recover which of the many patterns produced this match.
            E matchedPat = patterns[m.Pattern()];
            matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e));
            string phrase = string.Empty;
            string phraseLemma = string.Empty;
            bool useWordNotLabeled = false;
            bool doNotUse = false;
            // If the neighboring words are labeled with the same label, club them into the match
            // by widening [s, e).
            if (constVars.clubNeighboringLabeledWords)
            {
                for (int i = s - 1; i >= 0; i--)
                {
                    if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                    {
                        s = i + 1;
                        break;
                    }
                }
                for (int i_1 = e; i_1 < sent.Count; i_1++)
                {
                    if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                    {
                        e = i_1;
                        break;
                    }
                }
            }
            // To make sure we discard phrases with stopwords in between, but include the ones in
            // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
            // addedindices[k] records whether token s+k contributed to the phrase.
            bool[] addedindices = new bool[e - s];
            // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
            for (int i_2 = s; i_2 < e; i_2++)
            {
                CoreLabel l = sent[i_2];
                // Mark the token as matched, and record which pattern matched it.
                l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)))
                {
                    l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                }
                l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
                // Reject the phrase if the token carries any of the "ignore" classes for this label.
                foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                {
                    if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                    {
                        doNotUse = true;
                    }
                }
                bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                if (removePhrasesWithStopWords && containsStop)
                {
                    doNotUse = true;
                }
                else
                {
                    if (!containsStop || !removeStopWordsFromSelectedPhrases)
                    {
                        // The token counts as "not labeled" unless it already carries this label.
                        // NOTE(review): this variant compares against label.ToString(), unlike the
                        // sibling Call() implementations which compare against label directly.
                        if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString()))
                        {
                            useWordNotLabeled = true;
                        }
                        phrase += " " + l.Word();
                        phraseLemma += " " + l.Lemma();
                        addedindices[i_2 - s] = true;
                    }
                }
            }
            // Reject phrases with an internal gap: a skipped token between two added tokens.
            for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
            {
                if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                {
                    doNotUse = true;
                    break;
                }
            }
            if (!doNotUse && useWordNotLabeled)
            {
                phrase = phrase.Trim();
                phraseLemma = phraseLemma.Trim();
                allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0);
            }
        }
    }
    return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
}