/// <summary>
/// Builds the feature datum for the token at index <paramref name="i"/> of <paramref name="sent"/>.
/// </summary>
/// <remarks>
/// The datum is labeled <c>answerLabel</c> when the token's <c>answerClass</c> annotation, rendered with
/// <c>ToString()</c>, equals <c>answerLabel</c>; otherwise it is labeled "O". Every phrase matched on the
/// token contributes a "Cluster-N" feature, where N is the phrase's entry in <c>clusterIds</c>, or -1 when
/// the phrase has no entry there.
/// </remarks>
/// <param name="sent">Tokens to read from.</param>
/// <param name="i">Index into <paramref name="sent"/> of the token to featurize.</param>
/// <returns>A datum holding the collected feature counts and the label.</returns>
private RVFDatum<string, string> GetDatum(CoreLabel[] sent, int i)
{
    ICounter<string> feat = new ClassicCounter<string>();
    CoreLabel l = sent[i];
    string label = l.Get(answerClass).ToString().Equals(answerLabel) ? answerLabel : "O";

    CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
    if (matchedPhrases == null)
    {
        // Nothing was recorded on this token: fall back to the token's own word.
        matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
        matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
    }

    foreach (CandidatePhrase w in matchedPhrases.AllValues())
    {
        // The lookup must not go through the indexer: an int is never null, so a
        // "num == null" fallback cannot run, and a dictionary indexer throws on a
        // missing key. TryGetValue gives the intended "-1 when unknown" behaviour.
        int num;
        if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
        {
            num = -1;
        }
        feat.SetCount("Cluster-" + num, 1.0);
    }

    // The context window is currently 0, so neither loop below adds any feature.
    // Word/lemma/tag features of the token itself are intentionally not emitted.
    int window = 0;
    for (int j = Math.Max(0, i - window); j < i; j++)
    {
        CoreLabel lj = sent[j];
        feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
        feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
    }
    for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
    {
        CoreLabel lj = sent[j_1];
        feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
        feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
    }

    return new RVFDatum<string, string>(feat, label);
}
/// <summary>
/// Walks from <paramref name="w"/> up through its ancestors in <paramref name="graph"/>, for at most
/// <c>upDepth</c> steps, and collects one dependency pattern per acceptable (ancestor, relation) pair.
/// </summary>
/// <remarks>
/// An ancestor is acceptable when its tag matches one of <c>allowedTagPatternForTrigger</c>, the relation
/// linking it to the node below is not rejected by <c>IfIgnoreRel</c>, its word is not in
/// <paramref name="stopWords"/>, and its word is longer than one character.
/// </remarks>
/// <param name="w">Node at which the upward walk starts.</param>
/// <param name="graph">Graph used to look up parents and relations.</param>
/// <param name="stopWords">Phrases that must not be used as the ancestor word.</param>
/// <param name="sent">Passed through to <c>PatternToDepPattern</c>.</param>
/// <returns>The set of patterns collected along the walk; empty when none qualified.</returns>
internal static ICollection<DepPattern> GetContext(IndexedWord w, SemanticGraph graph, ICollection<CandidatePhrase> stopWords, DataInstance sent)
{
    ICollection<DepPattern> collected = new HashSet<DepPattern>();
    IndexedWord current = w;

    for (int level = 1; level <= upDepth; level++)
    {
        IndexedWord head = graph.GetParent(current);
        if (head == null)
        {
            // Reached the top of the graph.
            break;
        }

        GrammaticalRelation relation = graph.Reln(head, current);
        foreach (Pattern allowedTag in allowedTagPatternForTrigger)
        {
            if (!allowedTag.Matcher(head.Tag()).Matches())
            {
                continue;
            }
            // The three checks below run in this order on purpose; the stop-word
            // lookup goes through CandidatePhrase.CreateOrGet.
            if (IfIgnoreRel(relation))
            {
                continue;
            }
            if (stopWords.Contains(CandidatePhrase.CreateOrGet(head.Word())))
            {
                continue;
            }
            if (head.Word().Length <= 1)
            {
                continue;
            }

            Pair<IndexedWord, GrammaticalRelation> edge = new Pair<IndexedWord, GrammaticalRelation>(head, relation);
            DepPattern converted = PatternToDepPattern(edge, sent);
            if (level <= upDepth)
            {
                collected.Add(converted);
            }
        }

        current = head;
    }

    return collected;
}
/// <summary>
/// Runs every token-sequence pattern in <c>patterns</c> over every sentence listed in <c>sentids</c> and
/// collects the phrases captured by the <c>$term</c> group.
/// </summary>
/// <remarks>
/// Each token inside a match is marked with <c>PatternsAnnotations.MatchedPattern</c> and has the pattern
/// added to its <c>PatternsAnnotations.MatchedPatterns</c> set, even when the phrase is later discarded.
/// When <c>constVars.clubNeighboringLabeledWords</c> is set, the match is first widened over the
/// contiguous run of neighbouring tokens already labeled with <c>label</c>.
/// </remarks>
/// <returns>
/// A triple of: phrase-by-pattern counts; for each pattern, the (sentence id, start index, inclusive end
/// index) spans it matched; and the phrases all of whose added tokens already carried <c>label</c>.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> Call()
{
    try
    {
        ICollection<CandidatePhrase> alreadyLabeledPhrases = new HashSet<CandidatePhrase>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
        CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();

        foreach (string sentid in sentids)
        {
            IList<CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair<TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }

                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // Higher branch limits make matching faster but use more memory.
                m.SetBranchLimit(5);

                while (m.Find())
                {
                    // s is the first token of the phrase, e is one past the last token.
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);

                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;

                    // Club the match together with neighbouring tokens that are already labeled.
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        // Step left while the previous token carries the label. Stepping one
                        // token at a time (instead of searching for the first unlabeled token)
                        // also covers a labeled run that reaches the start of the sentence.
                        while (s > 0)
                        {
                            object before = sent[s - 1].Get(constVars.GetAnswerClass()[label]);
                            // A token without the annotation counts as unlabeled.
                            if (before == null || !before.Equals(label))
                            {
                                break;
                            }
                            s--;
                        }
                        // Same to the right; e stays exclusive and may reach sent.Count.
                        while (e < sent.Count)
                        {
                            object after = sent[e].Get(constVars.GetAnswerClass()[label]);
                            if (after == null || !after.Equals(label))
                            {
                                break;
                            }
                            e++;
                        }
                    }

                    // Tracks which tokens of [s, e) went into the phrase, so that phrases with a
                    // dropped token in the middle can be rejected while stop words trimmed from the
                    // ends (removeStopWordsFromSelectedPhrases) are tolerated. Starts all false.
                    bool[] addedindices = new bool[e - s];
                    for (int tokenIdx = s; tokenIdx < e; tokenIdx++)
                    {
                        CoreLabel l = sent[tokenIdx];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
                        }

                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);

                        // Any token carrying one of the configured (annotation, value) pairs vetoes the phrase.
                        foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }

                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else if (!containsStop || !removeStopWordsFromSelectedPhrases)
                        {
                            if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                useWordNotLabeled = true;
                            }
                            phrase += " " + l.Word();
                            phraseLemma += " " + l.Lemma();
                            addedindices[tokenIdx - s] = true;
                        }
                    }

                    // Reject the phrase when a single skipped token sits between two added tokens.
                    for (int k = 1; k < addedindices.Length - 1; k++)
                    {
                        if (addedindices[k - 1] && !addedindices[k] && addedindices[k + 1])
                        {
                            doNotUse = true;
                            break;
                        }
                    }

                    if (!doNotUse)
                    {
                        // The span is recorded with an inclusive end index.
                        matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }

        return new Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    }
    catch (Exception ex)
    {
        // Log here, then rethrow with the original stack trace.
        logger.Error(ex);
        throw;
    }
}
/// <summary>
/// Runs every dependency (Semgrex) pattern in <c>patterns</c> over every sentence listed in
/// <c>sentids</c> and collects the extracted phrases that contain at least one token not yet labeled
/// with <c>label</c>.
/// </summary>
/// <remarks>
/// Each token inside a match is marked with <c>PatternsAnnotations.MatchedPattern</c> and has the pattern
/// added to its <c>PatternsAnnotations.MatchedPatterns</c> set, even when the phrase is later discarded.
/// When <c>constVars.clubNeighboringLabeledWords</c> is set, the match is first widened over neighbouring
/// tokens already labeled with <c>label</c>, as long as the widened phrase stays within
/// <c>PatternFactory.numWordsCompoundMapped[label]</c> tokens.
/// </remarks>
/// <returns>
/// A pair of: phrase-by-pattern counts; and, for each pattern, the (sentence id, start index, inclusive
/// end index) spans of the phrases that were counted.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>> Call()
{
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
    CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();

    foreach (string sentid in sentids)
    {
        DataInstance sent = sents[sentid];
        IList<CoreLabel> tokens = sent.GetTokens();
        foreach (KeyValuePair<SemgrexPattern, E> pEn in patterns)
        {
            if (pEn.Key == null)
            {
                throw new Exception("why is the pattern " + pEn + " null?");
            }

            SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
            ICollection<ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);

            foreach (ExtractedPhrase match in matched)
            {
                // s is the first token of the phrase; e is one past the last token
                // (match.endIndex is inclusive).
                int s = match.startIndex;
                int e = match.endIndex + 1;
                string phrase = string.Empty;
                string phraseLemma = string.Empty;
                bool useWordNotLabeled = false;
                bool doNotUse = false;

                // Club the match together with neighbouring tokens that are already labeled.
                if (constVars.clubNeighboringLabeledWords)
                {
                    for (int left = s - 1; left >= 0; left--)
                    {
                        object answer = tokens[left].Get(constVars.GetAnswerClass()[label]);
                        // e is exclusive, so starting at `left` gives a phrase of e - left tokens.
                        // A token without the annotation counts as unlabeled.
                        if (answer != null && answer.Equals(label) && (e - left) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            s = left;
                        }
                        else
                        {
                            break;
                        }
                    }
                    for (int right = e; right < tokens.Count; right++)
                    {
                        object answer = tokens[right].Get(constVars.GetAnswerClass()[label]);
                        // Including token `right` gives a phrase of right - s + 1 tokens.
                        if (answer != null && answer.Equals(label) && (right - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            // e is exclusive: move it past the token that was just accepted.
                            e = right + 1;
                        }
                        else
                        {
                            break;
                        }
                    }
                }

                // Tracks which tokens of [s, e) went into the phrase, so that phrases with a
                // dropped token in the middle can be rejected while stop words trimmed from the
                // ends (removeStopWordsFromSelectedPhrases) are tolerated. Starts all false.
                bool[] addedindices = new bool[e - s];
                for (int tokenIdx = s; tokenIdx < e; tokenIdx++)
                {
                    CoreLabel l = tokens[tokenIdx];
                    l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                    if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                    {
                        l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
                    }

                    Pattern pSur = pEn.Value;
                    System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                    System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                    l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);

                    // Any token carrying one of the configured (annotation, value) pairs vetoes the phrase.
                    foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                    {
                        if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                        {
                            doNotUse = true;
                        }
                    }

                    bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                    if (removePhrasesWithStopWords && containsStop)
                    {
                        doNotUse = true;
                    }
                    else if (!containsStop || !removeStopWordsFromSelectedPhrases)
                    {
                        if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                        {
                            useWordNotLabeled = true;
                        }
                        phrase += " " + l.Word();
                        phraseLemma += " " + l.Lemma();
                        addedindices[tokenIdx - s] = true;
                    }
                }

                // Reject the phrase when a single skipped token sits between two added tokens.
                for (int k = 1; k < addedindices.Length - 1; k++)
                {
                    if (addedindices[k - 1] && !addedindices[k] && addedindices[k + 1])
                    {
                        doNotUse = true;
                        break;
                    }
                }

                // Only phrases with at least one not-yet-labeled token are recorded; that also
                // guarantees the phrase text is non-empty.
                if (!doNotUse && useWordNotLabeled)
                {
                    // The span is recorded with an inclusive end index.
                    matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
                    phrase = phrase.Trim();
                    phraseLemma = phraseLemma.Trim();
                    allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                }
            }
        }
    }

    return new Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>>(allFreq, matchedTokensByPat);
}