Example #1
0
 public virtual void TestSubsumesArray()
 {
     string[] arr1 = new string[] { ",", "line", ",", "on" };
     string[] arr2 = new string[] { ",", "line", "," };
     NUnit.Framework.Assert.IsTrue(SurfacePattern.SubsumesArray(arr1, arr2));
     NUnit.Framework.Assert.IsFalse(SurfacePattern.SubsumesArray(arr2, null));
 }
Example #2
0
        public virtual void TestSimplerTokens()
        {
            IDictionary <Type, string> prev   = new _Dictionary_44();
            IDictionary <Type, string> next   = new _Dictionary_49();
            PatternToken               token  = new PatternToken("V", false, true, 2, null, false, false, null);
            SurfacePattern             p      = new SurfacePattern(CreateContext(prev), token, CreateContext(next), SurfacePatternFactory.Genre.Prevnext);
            IDictionary <Type, string> prev2  = new _Dictionary_58();
            IDictionary <Type, string> next2  = new _Dictionary_63();
            PatternToken               token2 = new PatternToken("V", false, true, 2, null, false, false, null);
            SurfacePattern             p2     = new SurfacePattern(CreateContext(prev2), token2, CreateContext(next2), SurfacePatternFactory.Genre.Prevnext);

            System.Diagnostics.Debug.Assert(p.CompareTo(p2) == 0);
            ICounter <SurfacePattern> pats = new ClassicCounter <SurfacePattern>();

            pats.SetCount(p, 1);
            pats.SetCount(p2, 1);
            System.Diagnostics.Debug.Assert(pats.Size() == 1);
            System.Console.Out.WriteLine("pats size is " + pats.Size());
            ConcurrentHashIndex <SurfacePattern> index = new ConcurrentHashIndex <SurfacePattern>();

            index.Add(p);
            index.Add(p2);
            System.Diagnostics.Debug.Assert(index.Count == 1);
        }
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         logger.Error(e);
         throw;
     }
 }
Example #4
0
        public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords)
        {
            ICollection <SurfacePattern> prevpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> nextpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>();
            CoreLabel token = sent[i];
            string    tag   = null;

            if (usePOS4Pattern)
            {
                string fulltag = token.Tag();
                if (useCoarsePOS)
                {
                    tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
                }
                else
                {
                    tag = fulltag;
                }
            }
            string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));

            for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
            {
                IList <Token>  previousTokens   = new List <Token>();
                IList <string> originalPrev     = new List <string>();
                IList <string> originalNext     = new List <string>();
                IList <Token>  nextTokens       = new List <Token>();
                int            numStopWordsprev = 0;
                int            numStopWordsnext = 0;
                // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
                int          numNonStopWordsNext = 0;
                int          numNonStopWordsPrev = 0;
                bool         useprev             = false;
                bool         usenext             = false;
                PatternToken twithoutPOS         = null;
                //TODO: right now using numWordsCompoundMax.
                if (addPatWithoutPOS)
                {
                    twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                PatternToken twithPOS = null;
                if (usePOS4Pattern)
                {
                    twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                if (usePreviousContext)
                {
                    // int j = Math.max(0, i - 1);
                    int j         = i - 1;
                    int numTokens = 0;
                    while (numTokens < maxWin && j >= 0)
                    {
                        // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j--;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException("how come the class "
                        //                + answerClass.get(label) + " for token "
                        //                + tokenj.word() + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        if (!isLabeledO)
                        {
                            // numPrevTokensSpecial++;
                            previousTokens.Add(0, strgeneric);
                            // previousTokens.add(0,
                            // "[{answer:"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalPrev.Add(0, strOriginal);
                            numNonStopWordsPrev++;
                        }
                        else
                        {
                            if (tokenj.Word().StartsWith("http"))
                            {
                                useprev = false;
                                previousTokens.Clear();
                                originalPrev.Clear();
                                break;
                            }
                            else
                            {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                previousTokens.Add(0, str);
                                originalPrev.Add(0, tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsprev++;
                                }
                                else
                                {
                                    numNonStopWordsPrev++;
                                }
                            }
                        }
                        numTokens++;
                        j--;
                    }
                }
                if (useNextContext)
                {
                    int numTokens = 0;
                    int j         = i + 1;
                    while (numTokens < maxWin && j < sent.Count)
                    {
                        // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j++;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException(
                        //                "how come the dict annotation for token " + tokenj.word()
                        //                    + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        // boolean isLabeledO = tokenj.get(answerClass.get(label))
                        // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                        if (!isLabeledO)
                        {
                            // numNextTokensSpecial++;
                            numNonStopWordsNext++;
                            nextTokens.Add(strgeneric);
                            // nextTokens.add("[{" + label + ":"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalNext.Add(strOriginal);
                        }
                        else
                        {
                            // originalNextStr += " "
                            // + tokenj.get(answerClass.get(label)).toString();
                            if (tokenj.Word().StartsWith("http"))
                            {
                                usenext = false;
                                nextTokens.Clear();
                                originalNext.Clear();
                                break;
                            }
                            else
                            {
                                // if (!tokenj.word().matches("[.,?()]")) {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                nextTokens.Add(str);
                                originalNext.Add(tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsnext++;
                                }
                                else
                                {
                                    numNonStopWordsNext++;
                                }
                            }
                        }
                        j++;
                        numTokens++;
                    }
                }
                // String prevContext = null, nextContext = null;
                // int numNonSpecialPrevTokens = previousTokens.size()
                // - numPrevTokensSpecial;
                // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
                Token[] prevContext = null;
                //String[] prevContext = null;
                //String[] prevOriginalArr = null;
                // if (previousTokens.size() >= minWindow4Pattern
                // && (numStopWordsprev < numNonSpecialPrevTokens ||
                // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
                if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
                {
                    // prevContext = StringUtils.join(previousTokens, fw);
                    IList <Token>  prevContextList = new List <Token>();
                    IList <string> prevOriginal    = new List <string>();
                    foreach (Token p in previousTokens)
                    {
                        prevContextList.Add(p);
                        if (!fw.IsEmpty())
                        {
                            prevContextList.Add(fw);
                        }
                    }
                    // add fw and sw to the the originalprev
                    foreach (string p_1 in originalPrev)
                    {
                        prevOriginal.Add(p_1);
                        if (!fw.IsEmpty())
                        {
                            prevOriginal.Add(" FW ");
                        }
                    }
                    if (!sw.IsEmpty())
                    {
                        prevContextList.Add(sw);
                        prevOriginal.Add(" SW ");
                    }
                    // String str = prevContext + fw + sw;
                    if (IsASCII(StringUtils.Join(prevOriginal)))
                    {
                        prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                        //prevOriginalArr = prevOriginal.toArray(new String[0]);
                        if (previousTokens.Count >= minWindow4Pattern)
                        {
                            if (twithoutPOS != null)
                            {
                                SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(pat);
                            }
                            if (twithPOS != null)
                            {
                                SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(patPOS);
                            }
                        }
                        useprev = true;
                    }
                }
                Token[] nextContext = null;
                //String [] nextOriginalArr = null;
                // if (nextTokens.size() > 0
                // && (numStopWordsnext < numNonSpecialNextTokens ||
                // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
                if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
                {
                    // nextContext = StringUtils.join(nextTokens, fw);
                    IList <Token>  nextContextList = new List <Token>();
                    IList <string> nextOriginal    = new List <string>();
                    if (!sw.IsEmpty())
                    {
                        nextContextList.Add(sw);
                        nextOriginal.Add(" SW ");
                    }
                    foreach (Token n in nextTokens)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextContextList.Add(fw);
                        }
                        nextContextList.Add(n);
                    }
                    foreach (string n_1 in originalNext)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextOriginal.Add(" FW ");
                        }
                        nextOriginal.Add(n_1);
                    }
                    if (nextTokens.Count >= minWindow4Pattern)
                    {
                        nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                        //nextOriginalArr =  nextOriginal.toArray(new String[0]);
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(patPOS);
                        }
                    }
                    usenext = true;
                }
                if (useprev && usenext)
                {
                    // String strprev = prevContext + fw + sw;
                    // String strnext = sw + fw + nextContext;
                    if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
                    {
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(patPOS);
                        }
                    }
                }
            }
            //    Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
            //        prevpatterns, nextpatterns, prevnextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prev patterns are " + prevpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " next patterns are " + nextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prevnext patterns are " + prevnextpatterns);
            //getPatternIndex().finishCommit();
            return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
        }