Пример #1
0
        /// <summary>see merge(CoreMap base, CoreMap toBeMerged)</summary>
        public static CoreLabel Merge(CoreLabel @base, CoreLabel toBeMerged)
        {
            //(variables)
            CoreLabel rtn = new CoreLabel(@base.Size());

            //(copy base)
            foreach (Type key in @base.KeySet())
            {
                rtn.Set(key, @base.Get(key));
            }
            //(merge)
            foreach (Type key_1 in toBeMerged.KeySet())
            {
                rtn.Set(key_1, toBeMerged.Get(key_1));
            }
            //(return)
            return(rtn);
        }
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         logger.Error(e);
         throw;
     }
 }
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
            // CollectionValuedMap<String, Integer>();
            TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();

            foreach (string sentid in sentids)
            {
                DataInstance      sent   = sents[sentid];
                IList <CoreLabel> tokens = sent.GetTokens();
                foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
                {
                    if (pEn.Key == null)
                    {
                        throw new Exception("why is the pattern " + pEn + " null?");
                    }
                    SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
                    //SemgrexMatcher m = pEn.getKey().matcher(graph);
                    //TokenSequenceMatcher m = pEn.getKey().matcher(sent);
                    //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                    //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                    //Higher branch values makes the faster but uses more memory
                    //m.setBranchLimit(5);
                    ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
                    foreach (ExtractedPhrase match in matched)
                    {
                        int    s                 = match.startIndex;
                        int    e                 = match.endIndex + 1;
                        string phrase            = string.Empty;
                        string phraseLemma       = string.Empty;
                        bool   useWordNotLabeled = false;
                        bool   doNotUse          = false;
                        //find if the neighboring words are labeled - if so - club them together
                        if (constVars.clubNeighboringLabeledWords)
                        {
                            for (int i = s - 1; i >= 0; i--)
                            {
                                if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    s = i;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
                                    break;
                                }
                            }
                            for (int i_1 = e; i_1 < tokens.Count; i_1++)
                            {
                                if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    e = i_1;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
                                    break;
                                }
                            }
                        }
                        //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                        bool[] addedindices = new bool[e - s];
                        // Arrays.fill(addedindices, false); // get for free on array initialization
                        for (int i_2 = s; i_2 < e; i_2++)
                        {
                            CoreLabel l = tokens[i_2];
                            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                            {
                                l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                            }
                            Pattern pSur = pEn.Value;
                            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                            foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                            {
                                if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                                {
                                    doNotUse = true;
                                }
                            }
                            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                            if (removePhrasesWithStopWords && containsStop)
                            {
                                doNotUse = true;
                            }
                            else
                            {
                                if (!containsStop || !removeStopWordsFromSelectedPhrases)
                                {
                                    if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                    {
                                        useWordNotLabeled = true;
                                    }
                                    phrase               += " " + l.Word();
                                    phraseLemma          += " " + l.Lemma();
                                    addedindices[i_2 - s] = true;
                                }
                            }
                        }
                        for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                        {
                            if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                            {
                                doNotUse = true;
                                break;
                            }
                        }
                        if (!doNotUse && useWordNotLabeled)
                        {
                            matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                            if (useWordNotLabeled)
                            {
                                phrase      = phrase.Trim();
                                phraseLemma = phraseLemma.Trim();
                                allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                            }
                        }
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }