public static DepPattern PatternToDepPattern(Pair <IndexedWord, GrammaticalRelation> p, DataInstance sent)
        {
            // Build a one-token dependency pattern anchored on the governor word of the
            // (word, relation) pair; the relation itself becomes the pattern's edge.
            CoreLabel anchor       = sent.GetTokens()[p.First().Index() - 1];
            Token     patternToken = new Token(PatternFactory.PatternType.Dep);

            System.Diagnostics.Debug.Assert(anchor.ContainsKey(typeof(PatternsAnnotations.ProcessedTextAnnotation)), "the keyset are " + anchor.ToString(CoreLabel.OutputFormat.All));
            // Restrict the pattern token to match the anchor's processed (normalized) text.
            patternToken.AddORRestriction(typeof(PatternsAnnotations.ProcessedTextAnnotation), anchor.Get(typeof(PatternsAnnotations.ProcessedTextAnnotation)));
            return new DepPattern(patternToken, p.Second());
        }
        private ICollection <ExtractedPhrase> GetMatchedTokensIndex(SemanticGraph graph, SemgrexPattern pattern, DataInstance sent, string label)
        {
            // Runs the given semgrex pattern over the sentence's dependency graph and
            // returns the extracted phrases (subject to matchingWordRestriction and the
            // per-label compound-word length limit).
            //TODO: look at the ignoreCommonTags flag
            ExtractPhraseFromPattern extract       = new ExtractPhraseFromPattern(false, PatternFactory.numWordsCompoundMapped[label]);
            ICollection <IntPair>    outputIndices = new List <IntPair>();
            bool findSubTrees         = true;
            IList <CoreLabel> tokensC = sent.GetTokens();
            //TODO: see if you can get rid of this (only used for matchedGraphs)
            // BUGFIX: the auto-converted code called tokensC.Stream().Map(null), which
            // dropped the word-projection lambda from the Java original (l -> l.word())
            // and would fail at runtime. Build the surface-word list explicitly instead.
            IList <string> tokens = new List <string>(tokensC.Count);
            foreach (CoreLabel tok in tokensC)
            {
                tokens.Add(tok.Word());
            }
            IList <string>          outputPhrases    = new List <string>();
            IList <ExtractedPhrase> extractedPhrases = new List <ExtractedPhrase>();
            //TODO: make features; extractFeatures is currently unused by the call below.
            Func <Pair <IndexedWord, SemanticGraph>, ICounter <string> > extractFeatures = new _IFunction_206();

            // extractedPhrases and outputIndices are populated by this call.
            extract.GetSemGrexPatternNodes(graph, tokens, outputPhrases, outputIndices, pattern, findSubTrees, extractedPhrases, constVars.matchLowerCaseContext, matchingWordRestriction);
            return extractedPhrases;
        }
// (removed non-code scraper artifact that broke compilation)
        // && !text.contains("+") &&
        // !text.contains("*");// && !
        // text.contains("$") && !text.contains("\"");
        public static IDictionary <int, ISet> GetPatternsAroundTokens(DataInstance sent, ICollection <CandidatePhrase> stopWords)
        {
            // For each token position in the sentence, collect the surface patterns that
            // can be formed around it. Stop-word positions keep an empty pattern set so
            // that every index is present in the returned map.
            IDictionary <int, ISet> patternsByIndex = new Dictionary <int, ISet>();
            IList <CoreLabel>       tokens          = sent.GetTokens();

            for (int idx = 0; idx < tokens.Count; idx++)
            {
                // Seed with an empty set; overwritten below when the token is usable.
                patternsByIndex[idx] = new HashSet <SurfacePattern>();
                CoreLabel current = tokens[idx];
                // do not create patterns around stop words!
                if (!PatternFactory.DoNotUse(current.Word(), stopWords))
                {
                    patternsByIndex[idx] = GetContext(sent.GetTokens(), idx, stopWords);
                }
            }
            return patternsByIndex;
        }
        /// <summary>
        /// Applies every semgrex pattern to every sentence in this worker's shard.
        /// Returns (a) a phrase-by-pattern frequency counter and (b) for each pattern,
        /// the (sentence-id, start, end-inclusive) token spans it matched.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
            // CollectionValuedMap<String, Integer>();
            TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();

            foreach (string sentid in sentids)
            {
                DataInstance      sent   = sents[sentid];
                IList <CoreLabel> tokens = sent.GetTokens();
                foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
                {
                    if (pEn.Key == null)
                    {
                        throw new Exception("why is the pattern " + pEn + " null?");
                    }
                    // NOTE(review): assumes every DataInstance in sents is a DataInstanceDep
                    // (dependency-parsed) — confirm upstream guarantees this cast.
                    SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
                    //SemgrexMatcher m = pEn.getKey().matcher(graph);
                    //TokenSequenceMatcher m = pEn.getKey().matcher(sent);
                    //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                    //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                    //Higher branch values makes the faster but uses more memory
                    //m.setBranchLimit(5);
                    ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
                    foreach (ExtractedPhrase match in matched)
                    {
                        // s..e is a half-open token range [s, e) over the matched span.
                        int    s                 = match.startIndex;
                        int    e                 = match.endIndex + 1;
                        string phrase            = string.Empty;
                        string phraseLemma       = string.Empty;
                        bool   useWordNotLabeled = false;
                        bool   doNotUse          = false;
                        //find if the neighboring words are labeled - if so - club them together
                        if (constVars.clubNeighboringLabeledWords)
                        {
                            // Extend the span leftwards while the preceding tokens already
                            // carry this label and the span stays within the compound limit.
                            for (int i = s - 1; i >= 0; i--)
                            {
                                if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    s = i;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
                                    break;
                                }
                            }
                            // Likewise extend rightwards over already-labeled tokens.
                            for (int i_1 = e; i_1 < tokens.Count; i_1++)
                            {
                                if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    e = i_1;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
                                    break;
                                }
                            }
                        }
                        //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                        // addedindices[k] records whether token s+k contributed to the phrase.
                        bool[] addedindices = new bool[e - s];
                        // Arrays.fill(addedindices, false); // get for free on array initialization
                        for (int i_2 = s; i_2 < e; i_2++)
                        {
                            CoreLabel l = tokens[i_2];
                            // Mark the token as covered by some pattern, and record which one.
                            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                            {
                                l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                            }
                            Pattern pSur = pEn.Value;
                            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                            // Skip phrases containing tokens with any configured ignore-class value.
                            foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                            {
                                if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                                {
                                    doNotUse = true;
                                }
                            }
                            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                            if (removePhrasesWithStopWords && containsStop)
                            {
                                doNotUse = true;
                            }
                            else
                            {
                                // Stop words are appended only when we are not stripping them
                                // from selected phrases; non-stop words are always appended.
                                if (!containsStop || !removeStopWordsFromSelectedPhrases)
                                {
                                    // The phrase counts only if at least one of its words is
                                    // not already labeled with this label.
                                    if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                    {
                                        useWordNotLabeled = true;
                                    }
                                    phrase               += " " + l.Word();
                                    phraseLemma          += " " + l.Lemma();
                                    addedindices[i_2 - s] = true;
                                }
                            }
                        }
                        // Reject phrases where a word was dropped from the MIDDLE of the span
                        // (added-gap-added); drops at the ends are acceptable.
                        for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                        {
                            if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                            {
                                doNotUse = true;
                                break;
                            }
                        }
                        if (!doNotUse && useWordNotLabeled)
                        {
                            // e is exclusive here, so store e - 1 as the inclusive end index.
                            matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                            // NOTE(review): this inner check is redundant — the outer condition
                            // already requires useWordNotLabeled.
                            if (useWordNotLabeled)
                            {
                                phrase      = phrase.Trim();
                                phraseLemma = phraseLemma.Trim();
                                allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                            }
                        }
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }