/// <summary>
/// Converts a (head word, grammatical relation) pair into a dependency pattern whose
/// trigger token is restricted to the head word's processed text.
/// </summary>
/// <param name="p">The head word (1-indexed into the sentence) and the relation to encode.</param>
/// <param name="sent">The sentence the head word belongs to.</param>
/// <returns>A new <c>DepPattern</c> over the head token and the given relation.</returns>
public static DepPattern PatternToDepPattern(Pair <IndexedWord, GrammaticalRelation> p, DataInstance sent)
{
    // IndexedWord indices are 1-based; the token list is 0-based.
    CoreLabel sourceLabel = sent.GetTokens()[p.First().Index() - 1];
    System.Diagnostics.Debug.Assert(sourceLabel.ContainsKey(typeof(PatternsAnnotations.ProcessedTextAnnotation)), "the keyset are " + sourceLabel.ToString(CoreLabel.OutputFormat.All));
    // Build a dep-type pattern token restricted to the head word's processed text.
    Token headToken = new Token(PatternFactory.PatternType.Dep);
    headToken.AddORRestriction(typeof(PatternsAnnotations.ProcessedTextAnnotation), sourceLabel.Get(typeof(PatternsAnnotations.ProcessedTextAnnotation)));
    return new DepPattern(headToken, p.Second());
}
/// <summary>
/// Builds a generalized surface token for a context word: ORs in the token's value for
/// every generalization class that is not the background symbol, and (optionally) its
/// NER tag. Also reports whether the token carried only background labels.
/// </summary>
/// <param name="tokenj">The context token to generalize.</param>
/// <returns>
/// (background-only flag, the generalized token, "|"-joined string of the matched class keys).
/// </returns>
internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj)
{
    Token genericToken = new Token(PatternFactory.PatternType.Surface);
    string labelStr = string.Empty;
    bool backgroundOnly = true;
    foreach (KeyValuePair <string, Type> gen in ConstantsAndVariables.GetGeneralizeClasses())
    {
        // Every generalization class is expected to be set on every token.
        if (!tokenj.ContainsKey(gen.Value) || tokenj.Get(gen.Value) == null)
        {
            throw new Exception(" Why does the token not have the class " + gen.Value + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
        }
        if (tokenj.Get(gen.Value).Equals(ConstantsAndVariables.backgroundSymbol))
        {
            continue;
        }
        backgroundOnly = false;
        labelStr = labelStr.Length == 0 ? gen.Key : labelStr + "|" + gen.Key;
        genericToken.AddORRestriction(gen.Value, gen.Key);
    }
    // Optionally also generalize over the NER tag when it is non-background.
    if (useContextNERRestriction)
    {
        string nerTag = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        if (nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol))
        {
            backgroundOnly = false;
            labelStr = labelStr.Length == 0 ? nerTag : labelStr + "|" + nerTag;
            genericToken.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
        }
    }
    return new Triple <bool, Token, string>(backgroundOnly, genericToken, labelStr);
}
/// <summary>
/// Create a new search problem instance, given a sentence (possibly fragment), and the
/// corresponding parse tree.
/// </summary>
/// <param name="parseTree">The original tree of the sentence we are beginning with</param>
/// <param name="truthOfPremise">The truth of the premise. In most applications, this will just be true.</param>
/// <returns>A new search problem instance.</returns>
/// <exception cref="System.ArgumentException">If any vertex lacks a polarity annotation.</exception>
public virtual ForwardEntailerSearchProblem Apply(SemanticGraph parseTree, bool truthOfPremise)
{
    // Fail fast: every vertex with a backing label must carry a polarity annotation,
    // which is produced by NaturalLogicAnnotator.
    foreach (IndexedWord vertex in parseTree.VertexSet())
    {
        CoreLabel backing = vertex.BackingLabel();
        if (backing == null || backing.ContainsKey(typeof(NaturalLogicAnnotations.PolarityAnnotation)))
        {
            continue;
        }
        throw new ArgumentException("Cannot run Natural Logic forward entailment without polarity annotations set. See " + typeof(NaturalLogicAnnotator).GetSimpleName());
    }
    return new ForwardEntailerSearchProblem(parseTree, truthOfPremise, maxResults, maxTicks, weights);
}
/// <summary>Create a datum from a string.</summary>
/// <remarks>
/// Create a datum from a string. The CoreAnnotations must correspond to those used by
/// SequenceClassifier. The following annotations are copied from the provided
/// CoreLabel cl, if present:
/// DomainAnnotation
/// startOffset and endOffset will be added to the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// of
/// the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreLabel"/>
/// cl to give the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// and
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/>
/// of the resulting datum.
/// </remarks>
/// <param name="cl">Source label supplying the base character offset and (optionally) the domain.</param>
/// <param name="token">Text of the new datum.</param>
/// <param name="label">Answer (and gold answer) label of the new datum.</param>
/// <param name="startOffset">Begin offset relative to cl's begin offset.</param>
/// <param name="endOffset">End offset relative to cl's begin offset.</param>
private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
{
    CoreLabel newTok = new CoreLabel();
    newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
    newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
    // Both offsets are computed from cl's BEGIN offset (see <remarks>): the end offset is
    // intentionally begin + endOffset, not end + endOffset.
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + startOffset);
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + endOffset);
    // Fix: the original guarded this copy with "cl != null", but cl was already dereferenced
    // unconditionally above, so that check was dead code and has been removed.
    if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
    {
        newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
    }
    return newTok;
}
/// <summary>
/// Returns the next token from the lexer (or the compound buffer when splitting is on),
/// skipping tokens that orthographic normalization reduced to the empty string, and
/// splitting compounds, verb+pronoun clusters, and contractions as configured.
/// </summary>
protected internal override T GetNext()
{
    try
    {
        T tok;
        // Drain zero-length tokens: normalization options can obliterate a token entirely.
        do
        {
            tok = (splitAny && !compoundBuffer.IsEmpty()) ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
        }
        while (tok != null && tok.Word().IsEmpty());
        // Split the token if the lexer flagged it with a parent annotation we handle.
        if (splitAny && tok is CoreLabel)
        {
            CoreLabel cl = (CoreLabel)tok;
            if (cl.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
            {
                if (splitCompounds && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.CompoundAnnotation))
                {
                    tok = (T)ProcessCompound(cl);
                }
                else if (splitVerbs && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.VbPronAnnotation))
                {
                    tok = (T)ProcessVerb(cl);
                }
                else if (splitContractions && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.ContrAnnotation))
                {
                    tok = (T)ProcessContraction(cl);
                }
            }
        }
        return tok;
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Recursively searches the tree for a node whose [BeginIndex, EndIndex) annotations
/// exactly equal the requested span, pruning subtrees that cannot contain it.
/// </summary>
/// <param name="tree">Root of the (sub)tree to search.</param>
/// <param name="start">Requested begin index.</param>
/// <param name="end">Requested end index.</param>
/// <returns>The matching node, or null if no node has exactly this span.</returns>
private static Tree FindTreeWithSpan(Tree tree, int start, int end)
{
    CoreLabel label = (CoreLabel)tree.Label();
    if (label != null && label.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) && label.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation)))
    {
        int spanStart = label.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int spanEnd = label.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        if (start == spanStart && end == spanEnd)
        {
            // Exact match.
            return tree;
        }
        if (end < spanStart || start >= spanEnd)
        {
            // Requested span lies entirely outside this subtree; prune.
            return null;
        }
    }
    // Partial overlap (or no span info here): a matching node may still be in a child.
    foreach (Tree child in tree.Children())
    {
        if (child == null)
        {
            continue;
        }
        Tree found = FindTreeWithSpan(child, start, end);
        if (found != null)
        {
            return found;
        }
    }
    return null;
}
/// <summary>
/// Returns the next token from the lexer (or the compound buffer when splitting is on),
/// skipping tokens that orthographic normalization reduced to zero length, then splitting
/// compounds and contractions as configured. The contraction check deliberately re-examines
/// the token produced by compound splitting.
/// </summary>
protected internal override T GetNext()
{
    try
    {
        T tok = null;
        // Drain zero-length tokens: normalization options can obliterate a token entirely.
        do
        {
            tok = ((splitContractions || splitCompounds) && compoundBuffer.Count > 0) ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
        }
        while (tok != null && tok.Word().Length == 0);
        // Compound splitting first...
        if (splitCompounds)
        {
            CoreLabel compoundCandidate = tok as CoreLabel;
            if (compoundCandidate != null && compoundCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) && compoundCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.CompoundAnnotation))
            {
                tok = (T)ProcessCompound(compoundCandidate);
            }
        }
        // ...then contraction splitting on the (possibly replaced) token.
        if (splitContractions)
        {
            CoreLabel contractionCandidate = tok as CoreLabel;
            if (contractionCandidate != null && contractionCandidate.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)) && contractionCandidate.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(FrenchLexer.ContrAnnotation))
            {
                tok = (T)ProcessContraction(contractionCandidate);
            }
        }
        return tok;
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Applies each token-sequence (surface) pattern to every sentence in this worker's shard
/// and collects the candidate phrases captured by the "$term" group.
/// </summary>
/// <returns>
/// A triple of: (1) counts of each candidate phrase per pattern that extracted it,
/// (2) for each pattern, the (sentence id, start, inclusive end) token spans it matched, and
/// (3) the candidate phrases whose tokens were already fully labeled with this label.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
{
    try
    {
        ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>();
        TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
        CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
        foreach (string sentid in sentids)
        {
            IList <CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // Higher branch values make matching faster but use more memory.
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    // If the neighboring words are labeled with the same label, club them into the match
                    // by widening [s, e). NOTE(review): if every token before s (or after e) is labeled,
                    // the loop runs off the end without the break firing and the bound is NOT widened
                    // to the sentence edge — presumably intentional, but worth confirming.
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    // To make sure we discard phrases with stopwords in between, but include the ones in
                    // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
                    // addedindices[k] records whether token s+k contributed to the phrase.
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        // Mark the token as matched, and record which pattern matched it.
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // Reject the phrase if the token carries any of the "ignore" classes for this label.
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                // The token counts as "not labeled" unless it already carries this label.
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject phrases with an internal gap: a skipped token between two added tokens.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        // Span end is stored inclusive: e - 1.
                        matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                // Every contributing token was already labeled with this label.
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        // Log before rethrowing so worker-thread failures are not silently lost.
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Applies each dependency (Semgrex) pattern to every sentence in this worker's shard and
/// collects the phrases matched against the sentence's dependency graph.
/// </summary>
/// <returns>
/// A pair of: (1) counts of each candidate phrase per pattern that extracted it, and
/// (2) for each pattern, the (sentence id, start, inclusive end) token spans it matched.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
{
    TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>();
    CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
    foreach (string sentid in sentids)
    {
        DataInstance sent = sents[sentid];
        IList <CoreLabel> tokens = sent.GetTokens();
        foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
        {
            if (pEn.Key == null)
            {
                throw new Exception("why is the pattern " + pEn + " null?");
            }
            // Matching is done against the dependency graph, not the token sequence.
            SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
            ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
            foreach (ExtractedPhrase match in matched)
            {
                int s = match.startIndex;
                // e is exclusive; ExtractedPhrase.endIndex is inclusive.
                int e = match.endIndex + 1;
                string phrase = string.Empty;
                string phraseLemma = string.Empty;
                bool useWordNotLabeled = false;
                bool doNotUse = false;
                // If the neighboring words are labeled with the same label, club them into the match,
                // as long as the widened phrase stays within the compound-length limit for this label.
                if (constVars.clubNeighboringLabeledWords)
                {
                    for (int i = s - 1; i >= 0; i--)
                    {
                        if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            s = i;
                        }
                        else
                        {
                            break;
                        }
                    }
                    for (int i_1 = e; i_1 < tokens.Count; i_1++)
                    {
                        if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                        {
                            e = i_1;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                // To make sure we discard phrases with stopwords in between, but include the ones in
                // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
                // addedindices[k] records whether token s+k contributed to the phrase.
                bool[] addedindices = new bool[e - s];
                // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
                for (int i_2 = s; i_2 < e; i_2++)
                {
                    CoreLabel l = tokens[i_2];
                    // Mark the token as matched, and record which pattern matched it.
                    l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                    if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                    {
                        l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                    }
                    Pattern pSur = pEn.Value;
                    System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                    System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                    l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                    // Reject the phrase if the token carries any of the "ignore" classes for this label.
                    foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                    {
                        if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                        {
                            doNotUse = true;
                        }
                    }
                    bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                    if (removePhrasesWithStopWords && containsStop)
                    {
                        doNotUse = true;
                    }
                    else
                    {
                        if (!containsStop || !removeStopWordsFromSelectedPhrases)
                        {
                            // The token counts as "not labeled" unless it already carries this label.
                            if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                useWordNotLabeled = true;
                            }
                            phrase += " " + l.Word();
                            phraseLemma += " " + l.Lemma();
                            addedindices[i_2 - s] = true;
                        }
                    }
                }
                // Reject phrases with an internal gap: a skipped token between two added tokens.
                for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                {
                    if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                    {
                        doNotUse = true;
                        break;
                    }
                }
                if (!doNotUse && useWordNotLabeled)
                {
                    // Span end is stored inclusive: e - 1.
                    matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                    if (useWordNotLabeled)
                    {
                        phrase = phrase.Trim();
                        phraseLemma = phraseLemma.Trim();
                        allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                    }
                }
            }
        }
    }
    return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
}
/// <summary>Take a dataset Annotation, generate their parse trees and identify syntactic heads (and head spans, if necessary)</summary>
/// <param name="dataset">The dataset to pre-process in place.</param>
public virtual void PreProcessSentences(Annotation dataset)
{
    logger.Severe("GenericDataSetReader: Started pre-processing the corpus...");
    // Run the processor (i.e., NER, parse etc.) unless the sentences already carry
    // syntactic annotation from offline files.
    if (processor != null)
    {
        IList <ICoreMap> sentences = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
        if (sentences.Count > 0 && !sentences[0].ContainsKey(typeof(TreeCoreAnnotations.TreeAnnotation)))
        {
            logger.Info("Annotating dataset with " + processor);
            processor.Annotate(dataset);
        }
        else
        {
            logger.Info("Found existing syntactic annotations. Will not use the NLP processor.");
        }
    }
    IList <ICoreMap> sentences_1 = dataset.Get(typeof(CoreAnnotations.SentencesAnnotation));
    logger.Fine("Extracted " + sentences_1.Count + " sentences.");
    foreach (ICoreMap sentence in sentences_1)
    {
        IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        logger.Fine("Processing sentence " + tokens);
        Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
        if (tree == null)
        {
            throw new Exception("ERROR: MR requires full syntactic analysis!");
        }
        // Convert tree labels to CoreLabel if necessary. We need this because we store
        // additional info in the CoreLabel, such as the spans of each tree.
        ConvertToCoreLabels(tree);
        // Store the tree spans, if not present already.
        CoreLabel l = (CoreLabel)tree.Label();
        if (forceGenerationOfIndexSpans || (!l.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) && !l.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation))))
        {
            tree.IndexSpans(0);
            logger.Fine("Index spans were generated.");
        }
        else
        {
            logger.Fine("Index spans were NOT generated.");
        }
        logger.Fine("Parse tree using CoreLabel:\n" + tree.PennString());
        // Now match all entity mentions against the syntactic tree.
        if (sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)) != null)
        {
            foreach (EntityMention ent in sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation)))
            {
                logger.Fine("Finding head for entity: " + ent);
                int headPos = AssignSyntacticHead(ent, tree, tokens, calculateHeadSpan);
                logger.Fine("Syntactic head of mention \"" + ent + "\" is: " + tokens[headPos].Word());
                System.Diagnostics.Debug.Assert((ent.GetExtent() != null));
                System.Diagnostics.Debug.Assert((ent.GetHead() != null));
                System.Diagnostics.Debug.Assert((ent.GetSyntacticHeadTokenPosition() >= 0));
            }
        }
    }
    logger.Severe("GenericDataSetReader: Pre-processing complete.");
}
/// <summary>
/// Applies the multi-pattern matcher (all surface patterns at once) to every sentence in this
/// worker's shard and collects the (phrase, lemma) pairs captured by the "$term" group.
/// </summary>
/// <returns>
/// A pair of: (1) counts of each (phrase, lemma) pair per pattern that extracted it, and
/// (2) for each pattern, the (sentence id, start, end) spans it matched.
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
{
    CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
    TwoDimensionalCounter <Pair <string, string>, E> allFreq = new TwoDimensionalCounter <Pair <string, string>, E>();
    foreach (string sentid in sentids)
    {
        IList <CoreLabel> sent = sents[sentid].GetTokens();
        // FIND_ALL is faster than FIND_NONOVERLAP.
        IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
        foreach (ISequenceMatchResult <ICoreMap> m in matched)
        {
            int s = m.Start("$term");
            int e = m.End("$term");
            // Recover which of the many patterns produced this match.
            E matchedPat = patterns[m.Pattern()];
            matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e));
            string phrase = string.Empty;
            string phraseLemma = string.Empty;
            bool useWordNotLabeled = false;
            bool doNotUse = false;
            // If the neighboring words are labeled with the same label, club them into the match
            // by widening [s, e).
            if (constVars.clubNeighboringLabeledWords)
            {
                for (int i = s - 1; i >= 0; i--)
                {
                    if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                    {
                        s = i + 1;
                        break;
                    }
                }
                for (int i_1 = e; i_1 < sent.Count; i_1++)
                {
                    if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                    {
                        e = i_1;
                        break;
                    }
                }
            }
            // To make sure we discard phrases with stopwords in between, but include the ones in
            // which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true:
            // addedindices[k] records whether token s+k contributed to the phrase.
            bool[] addedindices = new bool[e - s];
            // Arrays.fill(addedindices, false); // not needed, array elements initialize to false
            for (int i_2 = s; i_2 < e; i_2++)
            {
                CoreLabel l = sent[i_2];
                // Mark the token as matched, and record which pattern matched it.
                l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)))
                {
                    l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                }
                l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
                // Reject the phrase if the token carries any of the "ignore" classes for this label.
                foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                {
                    if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                    {
                        doNotUse = true;
                    }
                }
                bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                if (removePhrasesWithStopWords && containsStop)
                {
                    doNotUse = true;
                }
                else
                {
                    if (!containsStop || !removeStopWordsFromSelectedPhrases)
                    {
                        // The token counts as "not labeled" unless it already carries this label.
                        // NOTE(review): this variant compares against label.ToString(), unlike the
                        // sibling Call() implementations which compare against label directly.
                        if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString()))
                        {
                            useWordNotLabeled = true;
                        }
                        phrase += " " + l.Word();
                        phraseLemma += " " + l.Lemma();
                        addedindices[i_2 - s] = true;
                    }
                }
            }
            // Reject phrases with an internal gap: a skipped token between two added tokens.
            for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
            {
                if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                {
                    doNotUse = true;
                    break;
                }
            }
            if (!doNotUse && useWordNotLabeled)
            {
                phrase = phrase.Trim();
                phraseLemma = phraseLemma.Trim();
                allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0);
            }
        }
    }
    return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
}