/// <summary>
/// Checks SurfacePattern.SubsumesArray: a shorter array drawn from the longer
/// one is reported as subsumed, and a null second argument is not.
/// </summary>
public virtual void TestSubsumesArray()
{
    string[] longer = new string[] { ",", "line", ",", "on" };
    string[] shorter = new string[] { ",", "line", "," };
    // The shorter token sequence should be subsumed by the longer one.
    NUnit.Framework.Assert.IsTrue(SurfacePattern.SubsumesArray(longer, shorter));
    // Passing null as the candidate must yield false, not throw.
    NUnit.Framework.Assert.IsFalse(SurfacePattern.SubsumesArray(shorter, null));
}
/// <summary>
/// Verifies that two SurfacePatterns built from equivalent contexts and
/// identical tokens compare equal, collapse to a single entry in a
/// ClassicCounter, and occupy a single slot in a ConcurrentHashIndex.
/// </summary>
public virtual void TestSimplerTokens()
{
    IDictionary<Type, string> prev = new _Dictionary_44();
    IDictionary<Type, string> next = new _Dictionary_49();
    PatternToken token = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern p = new SurfacePattern(CreateContext(prev), token, CreateContext(next), SurfacePatternFactory.Genre.Prevnext);
    IDictionary<Type, string> prev2 = new _Dictionary_58();
    IDictionary<Type, string> next2 = new _Dictionary_63();
    PatternToken token2 = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern p2 = new SurfacePattern(CreateContext(prev2), token2, CreateContext(next2), SurfacePatternFactory.Genre.Prevnext);
    // BUG FIX: the original used System.Diagnostics.Debug.Assert for these
    // checks. Debug.Assert is annotated [Conditional("DEBUG")] and is compiled
    // out of Release builds, so in Release this test asserted nothing. Use the
    // test framework's assertions so failures are always reported.
    NUnit.Framework.Assert.AreEqual(0, p.CompareTo(p2));
    ICounter<SurfacePattern> pats = new ClassicCounter<SurfacePattern>();
    pats.SetCount(p, 1);
    pats.SetCount(p2, 1);
    // Equal patterns must share one counter key.
    NUnit.Framework.Assert.AreEqual(1, pats.Size());
    System.Console.Out.WriteLine("pats size is " + pats.Size());
    ConcurrentHashIndex<SurfacePattern> index = new ConcurrentHashIndex<SurfacePattern>();
    index.Add(p);
    index.Add(p2);
    // Equal patterns must also deduplicate in the index.
    NUnit.Framework.Assert.AreEqual(1, index.Count);
}
/// <summary>
/// Runs every token-sequence pattern in <c>patterns</c> over the sentences
/// named in <c>sentids</c> and collects, per pattern: the candidate phrases it
/// matched (with counts), the (sentence-id, start, end) spans of matched
/// tokens, and the phrases all of whose tokens were already labeled with
/// <c>label</c>.
/// </summary>
/// <returns>
/// Triple of (phrase-by-pattern frequency counter, pattern-to-matched-span
/// map, set of already-labeled candidate phrases).
/// </returns>
/// <exception cref="System.Exception"/>
public virtual Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> Call()
{
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
    // CollectionValuedMap<String, Integer>();
    try
    {
        ICollection<CandidatePhrase> alreadyLabeledPhrases = new HashSet<CandidatePhrase>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
        CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
        foreach (string sentid in sentids)
        {
            IList<CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair<TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    // Every pattern is expected to bind a "$term" capture group;
                    // [s, e) delimits the matched phrase inside the sentence.
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    // Widen [s, e) to cover any contiguous run of tokens on either
                    // side that already carry the target label.
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // not needed as initialized false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        // Mark the token as matched and record which pattern hit it.
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // Skip the whole phrase if any token carries a class/value
                        // pair configured to be ignored during selection.
                        foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                // Token is kept: note whether it was not already
                                // labeled, and append word/lemma to the phrase text.
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject phrases where a stop word was dropped from the MIDDLE
                    // (kept, dropped, kept) — only end-trimming is acceptable.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        // e - 1 stores the inclusive end index — NOTE(review):
                        // presumes the matcher's End() is exclusive; confirm.
                        matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            // useWordNotLabeled stays false only when every kept
                            // token already had the target label.
                            if (!useWordNotLabeled)
                            {
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return (new Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        // Log and rethrow so the executor surface sees the failure too.
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Builds the surface patterns around token <paramref name="i"/> of
/// <paramref name="sent"/>: previous-context patterns, next-context patterns,
/// and combined previous+next patterns, one set of candidates for every
/// context window size from 1 up to <c>maxWindow4Pattern</c>. Returns the
/// union of all three sets.
/// </summary>
/// <param name="sent">tokens of the sentence being processed</param>
/// <param name="i">index of the target token within <paramref name="sent"/></param>
/// <param name="stopWords">phrases treated as stop words when counting context</param>
public static ICollection<SurfacePattern> GetContext(IList<CoreLabel> sent, int i, ICollection<CandidatePhrase> stopWords)
{
    ICollection<SurfacePattern> prevpatterns = new HashSet<SurfacePattern>();
    ICollection<SurfacePattern> nextpatterns = new HashSet<SurfacePattern>();
    ICollection<SurfacePattern> prevnextpatterns = new HashSet<SurfacePattern>();
    CoreLabel token = sent[i];
    string tag = null;
    if (usePOS4Pattern)
    {
        // Coarse POS keeps only the first two characters of the tag
        // (e.g. collapsing fine-grained verb tags into one class).
        string fulltag = token.Tag();
        if (useCoarsePOS)
        {
            tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
        }
        else
        {
            tag = fulltag;
        }
    }
    string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
    // Generate candidate patterns once per window size.
    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
    {
        IList<Token> previousTokens = new List<Token>();
        IList<string> originalPrev = new List<string>();
        IList<string> originalNext = new List<string>();
        IList<Token> nextTokens = new List<Token>();
        int numStopWordsprev = 0;
        int numStopWordsnext = 0;
        // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
        int numNonStopWordsNext = 0;
        int numNonStopWordsPrev = 0;
        bool useprev = false;
        bool usenext = false;
        // The target slot of the pattern, with and without its POS restriction.
        PatternToken twithoutPOS = null;
        //TODO: right now using numWordsCompoundMax.
        if (addPatWithoutPOS)
        {
            twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        PatternToken twithPOS = null;
        if (usePOS4Pattern)
        {
            twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        if (usePreviousContext)
        {
            // Walk leftwards from the target, collecting up to maxWin tokens.
            // int j = Math.max(0, i - 1);
            int j = i - 1;
            int numTokens = 0;
            while (numTokens < maxWin && j >= 0)
            {
                // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // do not use this word in context consideration
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j--;
                    continue;
                }
                // if (!tokenj.containsKey(answerClass.get(label))) {
                // throw new RuntimeException("how come the class "
                // + answerClass.get(label) + " for token "
                // + tokenj.word() + " in " + sent + " is not set");
                // }
                Triple<bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                if (!isLabeledO)
                {
                    // numPrevTokensSpecial++;
                    // Insert at index 0 so the list stays in sentence order
                    // even though we scan right-to-left.
                    previousTokens.Add(0, strgeneric);
                    // previousTokens.add(0,
                    // "[{answer:"
                    // + tokenj.get(answerClass.get(label)).toString()
                    // + "}]");
                    originalPrev.Add(0, strOriginal);
                    numNonStopWordsPrev++;
                }
                else
                {
                    // Abandon the whole previous context if a URL-like token is hit.
                    if (tokenj.Word().StartsWith("http"))
                    {
                        useprev = false;
                        previousTokens.Clear();
                        originalPrev.Clear();
                        break;
                    }
                    else
                    {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        previousTokens.Add(0, str);
                        originalPrev.Add(0, tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsprev++;
                        }
                        else
                        {
                            numNonStopWordsPrev++;
                        }
                    }
                }
                numTokens++;
                j--;
            }
        }
        if (useNextContext)
        {
            // Walk rightwards from the target, mirroring the previous-context loop.
            int numTokens = 0;
            int j = i + 1;
            while (numTokens < maxWin && j < sent.Count)
            {
                // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // do not use this word in context consideration
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j++;
                    continue;
                }
                // if (!tokenj.containsKey(answerClass.get(label))) {
                // throw new RuntimeException(
                // "how come the dict annotation for token " + tokenj.word()
                // + " in " + sent + " is not set");
                // }
                Triple<bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                // boolean isLabeledO = tokenj.get(answerClass.get(label))
                // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                if (!isLabeledO)
                {
                    // numNextTokensSpecial++;
                    numNonStopWordsNext++;
                    nextTokens.Add(strgeneric);
                    // nextTokens.add("[{" + label + ":"
                    // + tokenj.get(answerClass.get(label)).toString()
                    // + "}]");
                    originalNext.Add(strOriginal);
                }
                else
                {
                    // originalNextStr += " "
                    // + tokenj.get(answerClass.get(label)).toString();
                    // Abandon the whole next context if a URL-like token is hit.
                    if (tokenj.Word().StartsWith("http"))
                    {
                        usenext = false;
                        nextTokens.Clear();
                        originalNext.Clear();
                        break;
                    }
                    else
                    {
                        // if (!tokenj.word().matches("[.,?()]")) {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        nextTokens.Add(str);
                        originalNext.Add(tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsnext++;
                        }
                        else
                        {
                            numNonStopWordsNext++;
                        }
                    }
                }
                j++;
                numTokens++;
            }
        }
        // String prevContext = null, nextContext = null;
        // int numNonSpecialPrevTokens = previousTokens.size()
        // - numPrevTokensSpecial;
        // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
        Token[] prevContext = null;
        //String[] prevContext = null;
        //String[] prevOriginalArr = null;
        // if (previousTokens.size() >= minWindow4Pattern
        // && (numStopWordsprev < numNonSpecialPrevTokens ||
        // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
        // Build previous-context patterns only if the window is large enough
        // and not made up entirely of stop words (beyond the allowed count).
        if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
        {
            // prevContext = StringUtils.join(previousTokens, fw);
            IList<Token> prevContextList = new List<Token>();
            IList<string> prevOriginal = new List<string>();
            // Interleave the optional filler-word token (fw) between context tokens.
            foreach (Token p in previousTokens)
            {
                prevContextList.Add(p);
                if (!fw.IsEmpty())
                {
                    prevContextList.Add(fw);
                }
            }
            // add fw and sw to the the originalprev
            // NOTE(review): " FW "/" SW " look like textual placeholders for the
            // filler-word/stop-word slots in the human-readable form — confirm.
            foreach (string p_1 in originalPrev)
            {
                prevOriginal.Add(p_1);
                if (!fw.IsEmpty())
                {
                    prevOriginal.Add(" FW ");
                }
            }
            if (!sw.IsEmpty())
            {
                prevContextList.Add(sw);
                prevOriginal.Add(" SW ");
            }
            // String str = prevContext + fw + sw;
            // Only keep contexts whose original text is pure ASCII.
            if (IsASCII(StringUtils.Join(prevOriginal)))
            {
                prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                //prevOriginalArr = prevOriginal.toArray(new String[0]);
                if (previousTokens.Count >= minWindow4Pattern)
                {
                    if (twithoutPOS != null)
                    {
                        SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(pat);
                    }
                    if (twithPOS != null)
                    {
                        SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(patPOS);
                    }
                }
                useprev = true;
            }
        }
        Token[] nextContext = null;
        //String [] nextOriginalArr = null;
        // if (nextTokens.size() > 0
        // && (numStopWordsnext < numNonSpecialNextTokens ||
        // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
        // Same gating for next-context patterns (note: no ASCII check here).
        if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
        {
            // nextContext = StringUtils.join(nextTokens, fw);
            IList<Token> nextContextList = new List<Token>();
            IList<string> nextOriginal = new List<string>();
            if (!sw.IsEmpty())
            {
                nextContextList.Add(sw);
                nextOriginal.Add(" SW ");
            }
            foreach (Token n in nextTokens)
            {
                if (!fw.IsEmpty())
                {
                    nextContextList.Add(fw);
                }
                nextContextList.Add(n);
            }
            foreach (string n_1 in originalNext)
            {
                if (!fw.IsEmpty())
                {
                    nextOriginal.Add(" FW ");
                }
                nextOriginal.Add(n_1);
            }
            if (nextTokens.Count >= minWindow4Pattern)
            {
                nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                //nextOriginalArr = nextOriginal.toArray(new String[0]);
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(patPOS);
                }
            }
            usenext = true;
        }
        // When both sides produced usable context, also emit combined patterns.
        if (useprev && usenext)
        {
            // String strprev = prevContext + fw + sw;
            // String strnext = sw + fw + nextContext;
            if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
            {
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(patPOS);
                }
            }
        }
    }
    // Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
    // prevpatterns, nextpatterns, prevnextpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " prev patterns are " + prevpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " next patterns are " + nextpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " prevnext patterns are " + prevnextpatterns);
    //getPatternIndex().finishCommit();
    return (CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
}