/// <summary>
/// Builds a dependency pattern from a (word, relation) pair. The pattern's token is
/// restricted to the processed text stored on the sentence token that backs the word.
/// </summary>
/// <param name="p">Pair holding the indexed word (first) and its grammatical relation (second).</param>
/// <param name="sent">Sentence whose token list is addressed with the word's index minus one.</param>
/// <returns>A dependency pattern made of the restricted token and the pair's relation.</returns>
public static DepPattern PatternToDepPattern(Pair <IndexedWord, GrammaticalRelation> p, DataInstance sent)
        {
            Token depToken = new Token(PatternFactory.PatternType.Dep);

            // The word index is shifted down by one to address the sentence's token list.
            CoreLabel sourceLabel = sent.GetTokens()[p.First().Index() - 1];

            System.Diagnostics.Debug.Assert(
                sourceLabel.ContainsKey(typeof(PatternsAnnotations.ProcessedTextAnnotation)),
                "the keyset are " + sourceLabel.ToString(CoreLabel.OutputFormat.All));

            depToken.AddORRestriction(
                typeof(PatternsAnnotations.ProcessedTextAnnotation),
                sourceLabel.Get(typeof(PatternsAnnotations.ProcessedTextAnnotation)));

            GrammaticalRelation relation = p.Second();
            return new DepPattern(depToken, relation);
        }
예제 #2
0
        /// <summary>
        /// Builds the generalized context representation of a token: a surface token carrying an
        /// OR-restriction for every generalize class (and, optionally, the NER tag) whose value on
        /// <paramref name="tokenj"/> differs from the background symbol.
        /// </summary>
        /// <param name="tokenj">Token to inspect; it must carry a non-null value for every generalize class.</param>
        /// <returns>
        /// A triple of: <c>true</c> when no restriction was added, the surface token holding the
        /// restrictions, and the "|"-joined names of everything that was added.
        /// </returns>
        /// <exception cref="System.Exception">A generalize class is missing from the token or maps to null.</exception>
        internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj)
        {
            Token  restrictions   = new Token(PatternFactory.PatternType.Surface);
            string joinedNames    = string.Empty;
            bool   onlyBackground = true;

            // An earlier variant iterated over the answer classes instead of the generalize
            // classes; that code is no longer active.
            foreach (KeyValuePair <string, Type> generalize in ConstantsAndVariables.GetGeneralizeClasses())
            {
                Type annotationKey = generalize.Value;
                bool hasValue      = tokenj.ContainsKey(annotationKey) && tokenj.Get(annotationKey) != null;
                if (!hasValue)
                {
                    throw new Exception(" Why does the token not have the class " + annotationKey + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
                }
                if (tokenj.Get(annotationKey).Equals(ConstantsAndVariables.backgroundSymbol))
                {
                    // Background value: contributes nothing to the context.
                    continue;
                }
                onlyBackground = false;
                joinedNames    = joinedNames.IsEmpty() ? generalize.Key : joinedNames + "|" + generalize.Key;
                restrictions.AddORRestriction(annotationKey, generalize.Key);
            }

            if (useContextNERRestriction)
            {
                string nerTag       = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                bool   hasEntityTag = nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol);
                if (hasEntityTag)
                {
                    onlyBackground = false;
                    joinedNames    = joinedNames.IsEmpty() ? nerTag : joinedNames + "|" + nerTag;
                    restrictions.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
                }
            }

            return new Triple <bool, Token, string>(onlyBackground, restrictions, joinedNames);
        }
 /// <summary>
 /// Creates a search problem for a sentence (or sentence fragment) from its parse tree, after
 /// checking that every vertex with a backing label carries a polarity annotation.
 /// </summary>
 /// <param name="parseTree">The original tree of the sentence we are beginning with</param>
 /// <param name="truthOfPremise">The truth of the premise. In most applications, this will just be true.</param>
 /// <returns>A new search problem instance.</returns>
 /// <exception cref="System.ArgumentException">A backing label lacks the polarity annotation.</exception>
 public virtual ForwardEntailerSearchProblem Apply(SemanticGraph parseTree, bool truthOfPremise)
 {
     foreach (IndexedWord node in parseTree.VertexSet())
     {
         CoreLabel backing = node.BackingLabel();
         if (backing == null)
         {
             // Vertices without a backing label are not checked.
             continue;
         }
         if (backing.ContainsKey(typeof(NaturalLogicAnnotations.PolarityAnnotation)))
         {
             continue;
         }
         throw new ArgumentException("Cannot run Natural Logic forward entailment without polarity annotations set. See " + typeof(NaturalLogicAnnotator).GetSimpleName());
     }

     ForwardEntailerSearchProblem problem = new ForwardEntailerSearchProblem(parseTree, truthOfPremise, maxResults, maxTicks, weights);
     return problem;
 }
예제 #4
0
        /// <summary>Create a datum from a string.</summary>
        /// <remarks>
        /// The CoreAnnotations must correspond to those used by SequenceClassifier.
        /// The DomainAnnotation is copied from <paramref name="cl"/> when it is present there.
        /// <paramref name="startOffset"/> and <paramref name="endOffset"/> are both added to the
        /// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
        /// of <paramref name="cl"/> to give the
        /// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
        /// and
        /// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/>
        /// of the resulting datum.
        /// </remarks>
        /// <param name="cl">Source label supplying the base character offset and, optionally, the domain. Must not be null.</param>
        /// <param name="token">Text stored as both the text and the char annotation of the datum.</param>
        /// <param name="label">Value stored as both the answer and the gold answer annotation.</param>
        /// <param name="startOffset">Offset added to the base character offset to get the datum's begin offset.</param>
        /// <param name="endOffset">Offset added to the base character offset to get the datum's end offset.</param>
        /// <returns>The newly created label.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="cl"/> is null.</exception>
        private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
        {
            // cl is required for the offsets below, so reject null up front instead of
            // checking it only after it has already been dereferenced.
            if (cl == null)
            {
                throw new System.ArgumentNullException("cl");
            }

            CoreLabel newTok = new CoreLabel();

            newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
            newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);

            // Both offsets are relative to the BEGIN offset of the source label.
            int baseOffset = cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), baseOffset + startOffset);
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), baseOffset + endOffset);

            if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
            {
                newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
            }
            return(newTok);
        }
예제 #5
0
 /// <summary>
 /// Returns the next token with a non-empty word, taking it from the compound buffer when
 /// splitting is enabled and the buffer has content, otherwise from the lexer. When splitting
 /// is enabled and the token is a <c>CoreLabel</c> with a parent annotation, the token is
 /// passed to the compound, verb or contraction processor selected by that annotation.
 /// </summary>
 /// <returns>The next token, or null when the source yields null.</returns>
 /// <exception cref="RuntimeIOException">Wraps any <c>IOException</c> raised while reading.</exception>
 protected internal override T GetNext()
 {
     try
     {
         T candidate;
         do
         {
             // Depending on the orthographic normalization options, some tokens can be
             // obliterated; keep reading until a token with a non-empty word shows up.
             if (splitAny && !compoundBuffer.IsEmpty())
             {
                 candidate = (T)compoundBuffer.Remove(0);
             }
             else
             {
                 candidate = (T)lexer.Next();
             }
         }
         while (candidate != null && candidate.Word().IsEmpty());

         // Nothing to split: hand the token back unchanged.
         if (!splitAny || !(candidate is CoreLabel))
         {
             return candidate;
         }

         CoreLabel cl = (CoreLabel)candidate;
         if (!cl.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
         {
             return candidate;
         }

         // The first enabled splitter whose marker equals the parent annotation wins.
         if (splitCompounds && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.CompoundAnnotation))
         {
             candidate = (T)ProcessCompound(cl);
         }
         else if (splitVerbs && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.VbPronAnnotation))
         {
             candidate = (T)ProcessVerb(cl);
         }
         else if (splitContractions && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.ContrAnnotation))
         {
             candidate = (T)ProcessContraction(cl);
         }
         return candidate;
     }
     catch (IOException e)
     {
         throw new RuntimeIOException(e);
     }
 }
예제 #6
0
        /// <summary>
        /// Searches <paramref name="tree"/> depth-first for the first node whose begin and end
        /// index annotations equal <paramref name="start"/> and <paramref name="end"/>.
        /// </summary>
        /// <param name="tree">Root of the subtree to search; its label is cast to <c>CoreLabel</c>.</param>
        /// <param name="start">Begin index to look for.</param>
        /// <param name="end">End index to look for.</param>
        /// <returns>The matching node, or null when no node in the subtree matches.</returns>
        private static Tree FindTreeWithSpan(Tree tree, int start, int end)
        {
            CoreLabel nodeLabel = (CoreLabel)tree.Label();
            bool      hasSpan   = nodeLabel != null
                                  && nodeLabel.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation))
                                  && nodeLabel.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation));

            if (hasSpan)
            {
                int nodeStart = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                int nodeEnd   = nodeLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));

                // Exact match: this node is the answer.
                if (start == nodeStart && end == nodeEnd)
                {
                    return tree;
                }
                // The requested span ends before this node or starts at/after its end:
                // this subtree is not searched any further.
                if (end < nodeStart || start >= nodeEnd)
                {
                    return null;
                }
            }

            // Otherwise a match may still sit below this node.
            foreach (Tree child in tree.Children())
            {
                if (child != null)
                {
                    Tree found = FindTreeWithSpan(child, start, end);
                    if (found != null)
                    {
                        return found;
                    }
                }
            }

            return null;
        }
예제 #7
0
 /// <summary>
 /// Returns the next token with a non-empty word, taking it from the compound buffer when a
 /// splitting option is on and the buffer has content, otherwise from the lexer. The token is
 /// then passed through compound splitting and contraction splitting, in that order, when the
 /// respective option is on and the token's parent annotation equals the matching marker.
 /// </summary>
 /// <returns>The next token, or null when the source yields null.</returns>
 /// <exception cref="RuntimeIOException">Wraps any <c>IOException</c> raised while reading.</exception>
 protected internal override T GetNext()
 {
     try
     {
         T candidate = null;
         do
         {
             // Depending on the orthographic normalization options, some tokens can be
             // obliterated; keep reading until a token with a non-zero length word shows up.
             if ((splitContractions || splitCompounds) && compoundBuffer.Count > 0)
             {
                 candidate = (T)compoundBuffer.Remove(0);
             }
             else
             {
                 candidate = (T)lexer.Next();
             }
         }
         while (candidate != null && candidate.Word().Length == 0);

         // Compound splitting comes first; its result is what the contraction check sees.
         if (splitCompounds && candidate is CoreLabel)
         {
             CoreLabel compoundLabel = (CoreLabel)candidate;
             bool      isCompound    = compoundLabel.ContainsKey(typeof(CoreAnnotations.ParentAnnotation))
                                       && compoundLabel.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.CompoundAnnotation);
             if (isCompound)
             {
                 candidate = (T)ProcessCompound(compoundLabel);
             }
         }

         if (splitContractions && candidate is CoreLabel)
         {
             CoreLabel contractionLabel = (CoreLabel)candidate;
             bool      isContraction    = contractionLabel.ContainsKey(typeof(CoreAnnotations.ParentAnnotation))
                                          && contractionLabel.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.ContrAnnotation);
             if (isContraction)
             {
                 candidate = (T)ProcessContraction(contractionLabel);
             }
         }

         return candidate;
     }
     catch (IOException e)
     {
         throw new RuntimeIOException(e);
     }
 }
 /// <summary>
 /// Runs every token-sequence pattern over every sentence in <c>sentids</c> and gathers the
 /// phrases bound to the <c>$term</c> capture group.
 /// </summary>
 /// <remarks>
 /// For each match the span is first widened over directly adjacent tokens that already carry
 /// <c>label</c> (when <c>constVars.clubNeighboringLabeledWords</c> is set). Every token in the
 /// span is marked as matched and records the matching pattern. The phrase is dropped when a
 /// token hits an ignore class, when a token contains a stop word and
 /// <c>removePhrasesWithStopWords</c> is set, or when a skipped token sits directly between
 /// two kept tokens.
 /// </remarks>
 /// <returns>
 /// A triple of: the per-(phrase, pattern) match counts, the matched (sentence id, start,
 /// inclusive end) spans per pattern, and the phrases whose kept tokens were all already labeled.
 /// </returns>
 /// <exception cref="System.Exception">
 /// Thrown when a pattern key is null; any exception raised while matching is logged and rethrown.
 /// </exception>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 // Higher branch limits make matching faster but use more memory.
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     // [s, e) is the token span captured by $term.
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     // Club neighboring words that are already labeled into the span.
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         // Extend one token at a time so that a labeled run reaching the very
                         // start or end of the sentence is clubbed as well; scanning for the
                         // first unlabeled token would leave the span untouched in that case.
                         while (s > 0 && sent[s - 1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                         {
                             s--;
                         }
                         while (e < sent.Count && sent[e].Get(constVars.GetAnswerClass()[label]).Equals(label))
                         {
                             e++;
                         }
                     }
                     // Tracks which tokens of the span made it into the phrase, so that phrases with
                     // a stop word in the middle are discarded while stop words removed at the ends
                     // (removeStopWordsFromSelectedPhrases) are tolerated. Elements start out false.
                     bool[] addedindices = new bool[e - s];
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         // Any token carrying an ignore-class value disqualifies the whole phrase.
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 // A kept token that does not already carry the label makes the phrase "new".
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     // Reject the phrase when a skipped token sits directly between two kept tokens.
                     // NOTE(review): only a single skipped token is detected; confirm whether longer
                     // runs of skipped tokens inside the phrase should be rejected as well.
                     for (int i_3 = 1; i_3 < addedindices.Length - 1; i_3++)
                     {
                         if (addedindices[i_3 - 1] && !addedindices[i_3] && addedindices[i_3 + 1])
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         // The stored span uses an inclusive end index.
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         // Log here, then rethrow with the original stack trace intact.
         logger.Error(e);
         throw;
     }
 }
        /// <summary>
        /// Applies every dependency (Semgrex) pattern to the graph of every sentence in
        /// <c>sentids</c> and collects the extracted phrases that contain at least one kept
        /// token not already carrying <c>label</c>.
        /// </summary>
        /// <returns>
        /// A pair of: the per-(phrase, pattern) match counts, and the matched (sentence id, start,
        /// e - 1) spans per pattern.
        /// </returns>
        /// <exception cref="System.Exception">Thrown when a pattern key is null.</exception>
        public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();

            foreach (string sentid in sentids)
            {
                DataInstance      sent   = sents[sentid];
                IList <CoreLabel> tokens = sent.GetTokens();
                foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
                {
                    if (pEn.Key == null)
                    {
                        throw new Exception("why is the pattern " + pEn + " null?");
                    }
                    // The sentence is cast to DataInstanceDep to obtain its graph.
                    SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
                    // Matching is delegated to GetMatchedTokensIndex; no matcher is created
                    // or tuned (find type, branch limit) in this method.
                    ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
                    foreach (ExtractedPhrase match in matched)
                    {
                        // [s, e) is the token span of the match; e is one past match.endIndex.
                        int    s                 = match.startIndex;
                        int    e                 = match.endIndex + 1;
                        string phrase            = string.Empty;
                        string phraseLemma       = string.Empty;
                        bool   useWordNotLabeled = false;
                        bool   doNotUse          = false;
                        //find if the neighboring words are labeled - if so - club them together
                        if (constVars.clubNeighboringLabeledWords)
                        {
                            // Walk left while tokens carry the label and the size check holds.
                            // NOTE(review): e is an exclusive bound here, so (e - i + 1) looks one
                            // larger than the candidate span length - confirm the intended limit.
                            for (int i = s - 1; i >= 0; i--)
                            {
                                if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    s = i;
                                }
                                else
                                {
                                    break;
                                }
                            }
                            // Walk right while tokens carry the label and the size check holds.
                            // NOTE(review): e is used below as an exclusive bound, so assigning the
                            // index of the labeled token (e = i_1) appears to leave that token outside
                            // the span - confirm whether i_1 + 1 was intended.
                            for (int i_1 = e; i_1 < tokens.Count; i_1++)
                            {
                                if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    e = i_1;
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                        // Elements start out false; an entry becomes true when its token is added to the phrase.
                        bool[] addedindices = new bool[e - s];
                        for (int i_2 = s; i_2 < e; i_2++)
                        {
                            CoreLabel l = tokens[i_2];
                            // Mark the token as matched and record the matching pattern on it.
                            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                            {
                                l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                            }
                            Pattern pSur = pEn.Value;
                            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                            // Any token carrying an ignore-class value disqualifies the whole phrase.
                            foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                            {
                                if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                                {
                                    doNotUse = true;
                                }
                            }
                            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                            if (removePhrasesWithStopWords && containsStop)
                            {
                                doNotUse = true;
                            }
                            else
                            {
                                if (!containsStop || !removeStopWordsFromSelectedPhrases)
                                {
                                    // A kept token that does not already carry the label makes the phrase "new".
                                    if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                    {
                                        useWordNotLabeled = true;
                                    }
                                    phrase               += " " + l.Word();
                                    phraseLemma          += " " + l.Lemma();
                                    addedindices[i_2 - s] = true;
                                }
                            }
                        }
                        // Reject the phrase when a skipped token sits directly between two kept tokens.
                        for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                        {
                            if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                            {
                                doNotUse = true;
                                break;
                            }
                        }
                        // Only phrases with at least one not-yet-labeled kept token are recorded.
                        if (!doNotUse && useWordNotLabeled)
                        {
                            matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                            // This inner check is always true here: the enclosing condition already requires it.
                            if (useWordNotLabeled)
                            {
                                phrase      = phrase.Trim();
                                phraseLemma = phraseLemma.Trim();
                                allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                            }
                        }
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }
예제 #10
0
        /// <summary>
        /// Takes a dataset annotation, runs the NLP processor on it when no parse trees are present
        /// yet, and then, per sentence, converts the tree labels to CoreLabels, generates index
        /// spans where needed and assigns a syntactic head to every entity mention.
        /// </summary>
        /// <param name="dataset">Annotation holding the sentences to pre-process.</param>
        /// <exception cref="System.Exception">A sentence has no parse tree.</exception>
        public virtual void PreProcessSentences(Annotation dataset)
        {
            logger.Severe("GenericDataSetReader: Started pre-processing the corpus...");

            // Run the processor (NER, parse, ...) unless syntactic annotation is already there,
            // e.g. loaded from offline files. Only the first sentence is inspected.
            if (processor != null)
            {
                IList <ICoreMap> existing        = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
                bool             needsAnnotation = existing.Count > 0 && !existing[0].ContainsKey(typeof(TreeCoreAnnotations.TreeAnnotation));
                if (needsAnnotation)
                {
                    logger.Info("Annotating dataset with " + processor);
                    processor.Annotate(dataset);
                }
                else
                {
                    logger.Info("Found existing syntactic annotations. Will not use the NLP processor.");
                }
            }

            // Fetched again after the processor may have run.
            IList <ICoreMap> allSentences = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));

            logger.Fine("Extracted " + allSentences.Count + " sentences.");
            foreach (ICoreMap sentence in allSentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                logger.Fine("Processing sentence " + tokens);

                Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
                if (tree == null)
                {
                    throw new Exception("ERROR: MR requires full syntactic analysis!");
                }

                // Tree labels must be CoreLabels because extra information, such as the span
                // of each subtree, is stored on them.
                ConvertToCoreLabels(tree);

                // Generate the tree spans when forced, or when the root label has neither key.
                CoreLabel rootLabel = (CoreLabel)tree.Label();
                if (forceGenerationOfIndexSpans || !(rootLabel.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) || rootLabel.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation))))
                {
                    tree.IndexSpans(0);
                    logger.Fine("Index spans were generated.");
                }
                else
                {
                    logger.Fine("Index spans were NOT generated.");
                }
                logger.Fine("Parse tree using CoreLabel:\n" + tree.PennString());

                // Match all entity mentions against the syntactic tree.
                if (sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)) == null)
                {
                    continue;
                }
                foreach (EntityMention mention in sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)))
                {
                    logger.Fine("Finding head for entity: " + mention);
                    int headIndex = AssignSyntacticHead(mention, tree, tokens, calculateHeadSpan);
                    logger.Fine("Syntactic head of mention \"" + mention + "\" is: " + tokens[headIndex].Word());
                    System.Diagnostics.Debug.Assert((mention.GetExtent() != null));
                    System.Diagnostics.Debug.Assert((mention.GetHead() != null));
                    System.Diagnostics.Debug.Assert((mention.GetSyntacticHeadTokenPosition() >= 0));
                }
            }

            logger.Severe("GenericDataSetReader: Pre-processing complete.");
        }
        /// <summary>
        /// Runs the multi-pattern matcher over every sentence listed in <c>sentids</c> and collects the
        /// phrases captured by the <c>$term</c> group of each match.
        /// </summary>
        /// <remarks>
        /// For every match the method:
        /// <list type="bullet">
        /// <item><description>records the (sentence id, start, end) triple of the <c>$term</c> group under the matching pattern;</description></item>
        /// <item><description>optionally widens the span over adjacent tokens already carrying <c>label</c> (when <c>constVars.clubNeighboringLabeledWords</c> is set);</description></item>
        /// <item><description>marks every token in the span with <c>MatchedPattern</c> / <c>MatchedPatterns</c> annotations (a side effect on the sentence tokens);</description></item>
        /// <item><description>builds the phrase and its lemma form, honouring the stop-word and ignore-class settings, and counts it if at least one contributing token is not already labeled.</description></item>
        /// </list>
        /// </remarks>
        /// <returns>
        /// A pair whose first element counts (phrase, lemma phrase) per pattern and whose second element maps
        /// each pattern to the (sentence id, start, end) triples it matched (as reported by the matcher, before clubbing).
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
            TwoDimensionalCounter <Pair <string, string>, E>    allFreq            = new TwoDimensionalCounter <Pair <string, string>, E>();

            foreach (string sentid in sentids)
            {
                IList <CoreLabel> sent = sents[sentid].GetTokens();
                //FIND_ALL is faster than FIND_NONOVERLAP
                IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
                foreach (ISequenceMatchResult <ICoreMap> m in matched)
                {
                    int s          = m.Start("$term");
                    int e          = m.End("$term");
                    E   matchedPat = patterns[m.Pattern()];
                    matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e));
                    string phrase            = string.Empty;
                    string phraseLemma       = string.Empty;
                    bool   useWordNotLabeled = false;
                    bool   doNotUse          = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        // Walk outwards while the adjacent token carries the label. Unlike a
                        // "stop at the first unlabeled token" scan, this also extends the span
                        // when the labeled run reaches the very start or end of the sentence.
                        // object.Equals is used so a token without the annotation (null) is
                        // treated as unlabeled instead of throwing.
                        while (s > 0 && object.Equals(sent[s - 1].Get(constVars.GetAnswerClass()[label]), label))
                        {
                            s--;
                        }
                        while (e < sent.Count && object.Equals(sent[e].Get(constVars.GetAnswerClass()[label]), label))
                        {
                            e++;
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    // addedindices[k] is true when token (s + k) contributed to the phrase.
                    bool[] addedindices = new bool[e - s];
                    for (int pos = s; pos < e; pos++)
                    {
                        CoreLabel l = sent[pos];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)))
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                // One ignored class is enough to reject the phrase.
                                doNotUse = true;
                                break;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString()))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase               += " " + l.Word();
                                phraseLemma          += " " + l.Lemma();
                                addedindices[pos - s] = true;
                            }
                        }
                    }
                    // Reject the phrase if any token was skipped strictly between two tokens that
                    // were added, regardless of how many consecutive tokens were skipped. Skipped
                    // tokens at either end of the span are fine.
                    bool seenAdded        = false;
                    bool skippedAfterAdded = false;
                    foreach (bool added in addedindices)
                    {
                        if (added)
                        {
                            if (skippedAfterAdded)
                            {
                                doNotUse = true;
                                break;
                            }
                            seenAdded = true;
                        }
                        else
                        {
                            if (seenAdded)
                            {
                                skippedAfterAdded = true;
                            }
                        }
                    }
                    if (!doNotUse && useWordNotLabeled)
                    {
                        phrase      = phrase.Trim();
                        phraseLemma = phraseLemma.Trim();
                        allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0);
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }