/// <summary>
/// Constructs a surface pattern from an optional previous context, the target-term
/// token description, an optional next context, and the genre (which sides of the
/// context the pattern uses). The hash of the string form is cached at the end of
/// construction.
/// </summary>
public SurfacePattern(Token[] prevContext, PatternToken token, Token[] nextContext, SurfacePatternFactory.Genre genre)
    : base(PatternFactory.PatternType.Surface)
{
    SetPrevContext(prevContext);
    SetNextContext(nextContext);
    SetToken(token);
    this.genre = genre;
    // Computed last so the cached hash reflects the fully initialized pattern.
    hashcode = ToString().GetHashCode();
}
/// <summary>
/// Returns whether two surface patterns impose the same restrictions on the
/// target term, i.e. whether their PatternTokens are equal.
/// </summary>
/// <param name="p1">First pattern; its token is dereferenced, so it must be non-null.</param>
/// <param name="p2">Second pattern to compare against.</param>
/// <returns>True iff the two patterns' tokens compare equal.</returns>
public static bool SameRestrictions(Edu.Stanford.Nlp.Patterns.Surface.SurfacePattern p1, Edu.Stanford.Nlp.Patterns.Surface.SurfacePattern p2)
{
    // Return the comparison directly instead of the redundant if/else returning true/false.
    return p1.token.Equals(p2.token);
}
/// <summary>
/// Verifies that two SurfacePatterns built from equivalent contexts and identical
/// PatternTokens compare equal, collapse into a single entry in a ClassicCounter,
/// and occupy a single slot in a ConcurrentHashIndex.
/// </summary>
public virtual void TestSimplerTokens()
{
    IDictionary<Type, string> firstPrev = new _Dictionary_44();
    IDictionary<Type, string> firstNext = new _Dictionary_49();
    PatternToken firstToken = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern patternA = new SurfacePattern(CreateContext(firstPrev), firstToken, CreateContext(firstNext), SurfacePatternFactory.Genre.Prevnext);

    IDictionary<Type, string> secondPrev = new _Dictionary_58();
    IDictionary<Type, string> secondNext = new _Dictionary_63();
    PatternToken secondToken = new PatternToken("V", false, true, 2, null, false, false, null);
    SurfacePattern patternB = new SurfacePattern(CreateContext(secondPrev), secondToken, CreateContext(secondNext), SurfacePatternFactory.Genre.Prevnext);

    // Equivalent patterns must be indistinguishable under CompareTo...
    System.Diagnostics.Debug.Assert(patternA.CompareTo(patternB) == 0);

    // ...must merge into one key of a counter...
    ICounter<SurfacePattern> pats = new ClassicCounter<SurfacePattern>();
    pats.SetCount(patternA, 1);
    pats.SetCount(patternB, 1);
    System.Diagnostics.Debug.Assert(pats.Size() == 1);
    System.Console.Out.WriteLine("pats size is " + pats.Size());

    // ...and into one entry of a concurrent index.
    ConcurrentHashIndex<SurfacePattern> index = new ConcurrentHashIndex<SurfacePattern>();
    index.Add(patternA);
    index.Add(patternB);
    System.Diagnostics.Debug.Assert(index.Count == 1);
}
/// <summary>Stores the given PatternToken as this pattern's target-term token.</summary>
/// <param name="token">The token restriction to assign.</param>
public virtual void SetToken(PatternToken token) => this.token = token;
/// <summary>
/// Generates candidate surface patterns around the token at index <paramref name="i"/> of the sentence.
/// For each window size from 1 to maxWindow4Pattern it collects up to that many context tokens on each
/// side (skipping filler words; discarding a side entirely if it contains an "http"-prefixed token) and,
/// when the collected context is informative enough (contains non-stop words, or more stop words than
/// numMinStopWordsToAdd), creates patterns of three genres: previous-context only, next-context only, and
/// both combined. Depending on the addPatWithoutPOS / usePOS4Pattern flags, patterns are created with
/// and/or without a POS restriction on the target term.
/// </summary>
/// <param name="sent">The sentence as a list of CoreLabels.</param>
/// <param name="i">Index of the target token within <paramref name="sent"/>.</param>
/// <param name="stopWords">Stop phrases used (via DoNotUse) to classify context tokens as uninformative.</param>
/// <returns>The union of the prev-, next- and prev+next-genre pattern sets.</returns>
public static ICollection<SurfacePattern> GetContext(IList<CoreLabel> sent, int i, ICollection<CandidatePhrase> stopWords)
{
    ICollection<SurfacePattern> prevpatterns = new HashSet<SurfacePattern>();
    ICollection<SurfacePattern> nextpatterns = new HashSet<SurfacePattern>();
    ICollection<SurfacePattern> prevnextpatterns = new HashSet<SurfacePattern>();
    CoreLabel token = sent[i];
    // POS restriction for the target term, optionally coarsened to the first two tag characters.
    string tag = null;
    if (usePOS4Pattern)
    {
        string fulltag = token.Tag();
        if (useCoarsePOS)
        {
            tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
        }
        else
        {
            tag = fulltag;
        }
    }
    string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
    // Grow the context window one token at a time; each window size contributes its own patterns.
    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
    {
        IList<Token> previousTokens = new List<Token>();  // generic (pattern) form of the left context, kept in sentence order
        IList<string> originalPrev = new List<string>();  // surface/lemma strings of the left context
        IList<string> originalNext = new List<string>();  // surface/lemma strings of the right context
        IList<Token> nextTokens = new List<Token>();      // generic (pattern) form of the right context
        int numStopWordsprev = 0;
        int numStopWordsnext = 0;
        int numNonStopWordsNext = 0;
        int numNonStopWordsPrev = 0;
        bool useprev = false;
        bool usenext = false;
        // Target-term token with no POS restriction.
        PatternToken twithoutPOS = null;
        //TODO: right now using numWordsCompoundMax.
        if (addPatWithoutPOS)
        {
            twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        // Target-term token that keeps the POS restriction.
        PatternToken twithPOS = null;
        if (usePOS4Pattern)
        {
            twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        if (usePreviousContext)
        {
            // Walk left from the target, gathering up to maxWin usable context tokens.
            int j = i - 1;
            int numTokens = 0;
            while (numTokens < maxWin && j >= 0)
            {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // Filler words are skipped entirely: they neither count toward the window nor appear in the context.
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j--;
                    continue;
                }
                Triple<bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                if (!isLabeledO)
                {
                    // Token carries a non-background label: keep its labeled generic form, count it as informative.
                    previousTokens.Add(0, strgeneric);
                    originalPrev.Add(0, strOriginal);
                    numNonStopWordsPrev++;
                }
                else
                {
                    if (tokenj.Word().StartsWith("http"))
                    {
                        // A URL invalidates the whole left context for this window size.
                        useprev = false;
                        previousTokens.Clear();
                        originalPrev.Clear();
                        break;
                    }
                    else
                    {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        previousTokens.Add(0, str);  // insert at front so the list stays in sentence order
                        originalPrev.Add(0, tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsprev++;
                        }
                        else
                        {
                            numNonStopWordsPrev++;
                        }
                    }
                }
                numTokens++;
                j--;
            }
        }
        if (useNextContext)
        {
            // Walk right from the target, mirroring the left-context collection above.
            int numTokens = 0;
            int j = i + 1;
            while (numTokens < maxWin && j < sent.Count)
            {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // do not use this word in context consideration
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j++;
                    continue;
                }
                Triple<bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                if (!isLabeledO)
                {
                    numNonStopWordsNext++;
                    nextTokens.Add(strgeneric);
                    originalNext.Add(strOriginal);
                }
                else
                {
                    if (tokenj.Word().StartsWith("http"))
                    {
                        // A URL invalidates the whole right context for this window size.
                        usenext = false;
                        nextTokens.Clear();
                        originalNext.Clear();
                        break;
                    }
                    else
                    {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        nextTokens.Add(str);
                        originalNext.Add(tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsnext++;
                        }
                        else
                        {
                            numNonStopWordsNext++;
                        }
                    }
                }
                j++;
                numTokens++;
            }
        }
        Token[] prevContext = null;
        // Build previous-context patterns when the window is big enough and the context is not all stop words.
        if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
        {
            IList<Token> prevContextList = new List<Token>();
            IList<string> prevOriginal = new List<string>();
            // NOTE(review): fw/sw appear to be optional filler-word / stop-word wildcard tokens interleaved
            // into the pattern context, configured elsewhere — confirm in SurfacePatternFactory.
            foreach (Token p in previousTokens)
            {
                prevContextList.Add(p);
                if (!fw.IsEmpty())
                {
                    prevContextList.Add(fw);
                }
            }
            // add fw and sw to the the originalprev
            foreach (string p_1 in originalPrev)
            {
                prevOriginal.Add(p_1);
                if (!fw.IsEmpty())
                {
                    prevOriginal.Add(" FW ");
                }
            }
            if (!sw.IsEmpty())
            {
                prevContextList.Add(sw);
                prevOriginal.Add(" SW ");
            }
            // Only ASCII contexts are turned into previous-context patterns.
            if (IsASCII(StringUtils.Join(prevOriginal)))
            {
                prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                if (previousTokens.Count >= minWindow4Pattern)
                {
                    if (twithoutPOS != null)
                    {
                        SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(pat);
                    }
                    if (twithPOS != null)
                    {
                        SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(patPOS);
                    }
                }
                useprev = true;
            }
        }
        Token[] nextContext = null;
        // Build next-context patterns, mirroring the previous-context case (note: no ASCII check here).
        if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
        {
            IList<Token> nextContextList = new List<Token>();
            IList<string> nextOriginal = new List<string>();
            if (!sw.IsEmpty())
            {
                nextContextList.Add(sw);
                nextOriginal.Add(" SW ");
            }
            foreach (Token n in nextTokens)
            {
                if (!fw.IsEmpty())
                {
                    nextContextList.Add(fw);
                }
                nextContextList.Add(n);
            }
            foreach (string n_1 in originalNext)
            {
                if (!fw.IsEmpty())
                {
                    nextOriginal.Add(" FW ");
                }
                nextOriginal.Add(n_1);
            }
            if (nextTokens.Count >= minWindow4Pattern)
            {
                nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(patPOS);
                }
            }
            usenext = true;
        }
        // When both sides produced usable contexts, also emit combined prev+next patterns.
        if (useprev && usenext)
        {
            if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
            {
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(patPOS);
                }
            }
        }
    }
    return (CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
}