Example #1
        public virtual ICollection <string> GetFileSentIds(CollectionValuedMap <string, string> relevantWords)
        {
            ICollection <string> sentids = null;

            foreach (KeyValuePair <string, ICollection <string> > en in relevantWords)
            {
                foreach (string en2 in en.Value)
                {
                    if (!stopWords.Contains(en2.ToLower()))
                    {
                        string w = CombineKeyValue(en.Key, en2);
                        ICollection <string> st = index[w];
                        if (st == null)
                        {
                            //log.info("\n\nWARNING: INDEX HAS NO SENTENCES FOR " + w);
                            return(Java.Util.Collections.EmptySet());
                        }
                        //throw new RuntimeException("How come the index does not have sentences for " + w);
                        if (sentids == null)
                        {
                            sentids = st;
                        }
                        else
                        {
                            sentids = CollectionUtils.Intersection(sentids, st);
                        }
                    }
                }
            }
            return(sentids);
        }
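A minimal call sketch for the method above, assuming an instance named invIndex of the enclosing index class (the instance name and the map keys are illustrative, not from the source):

        // Hypothetical usage: collect the sentence ids indexed for every
        // non-stop-word (key, value) pair and intersect them.
        CollectionValuedMap <string, string> relevantWords = new CollectionValuedMap <string, string>();
        relevantWords.Add("word", "Acme");       // each pair is combined via CombineKeyValue
        relevantWords.Add("lemma", "acquire");
        ICollection <string> sentIds = invIndex.GetFileSentIds(relevantWords);
        // Returns null when relevantWords is empty (or all stop words), the empty
        // set when any pair is unknown to the index, and otherwise the
        // intersection of the matching sentence-id sets.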
        public virtual IList <IList <Mention> > ExtractGoldMentions(CoNLL2011DocumentReader.Document conllDoc)
        {
            IList <ICoreMap>         sentences                   = conllDoc.GetAnnotation().Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <IList <Mention> > allGoldMentions             = new List <IList <Mention> >();
            CollectionValuedMap <string, ICoreMap> corefChainMap = conllDoc.GetCorefChainMap();

            for (int i = 0; i < sentences.Count; i++)
            {
                allGoldMentions.Add(new List <Mention>());
            }
            int maxCorefClusterId = -1;

            foreach (string corefIdStr in corefChainMap.Keys)
            {
                int id = System.Convert.ToInt32(corefIdStr);
                if (id > maxCorefClusterId)
                {
                    maxCorefClusterId = id;
                }
            }
            int newMentionID = maxCorefClusterId + 1;

            foreach (KeyValuePair <string, ICollection <ICoreMap> > idChainEntry in corefChainMap)
            {
                int id = System.Convert.ToInt32(idChainEntry.Key);
                int clusterMentionCnt = 0;
                foreach (ICoreMap m in idChainEntry.Value)
                {
                    clusterMentionCnt++;
                    Mention mention = new Mention();
                    mention.goldCorefClusterID = id;
                    if (clusterMentionCnt == 1)
                    {
                        // First mention in cluster
                        mention.mentionID   = id;
                        mention.originalRef = -1;
                    }
                    else
                    {
                        mention.mentionID   = newMentionID;
                        mention.originalRef = id;
                        newMentionID++;
                    }
                    if (maxID < mention.mentionID)
                    {
                        maxID = mention.mentionID;
                    }
                    int      sentIndex = m.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    ICoreMap sent      = sentences[sentIndex];
                    mention.startIndex = m.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) - sent.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    mention.endIndex   = m.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - sent.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    // will be set by arrange
                    mention.originalSpan = m.Get(typeof(CoreAnnotations.TokensAnnotation));
                    // Mention dependency graph is the enhanced dependency graph of the sentence
                    mention.dependency = sentences[sentIndex].Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
                    allGoldMentions[sentIndex].Add(mention);
                }
            }
            return(allGoldMentions);
        }
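A worked example of the id assignment above: with maxCorefClusterId == 12, a chain keyed "7" that holds three mentions produces mentionIDs 7, 13, and 14 (newMentionID counts upward from 13 across all chains); originalRef is -1 for the first mention in the chain and 7 for the other two.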
Example #3
 /*
  * void runParallelApplyPats(Map<String, List<CoreLabel>> sents, Set<String> sentIds, String label, Counter<E> patternsLearnedThisIter,  TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
  * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws InterruptedException, ExecutionException {
  * List<String> keyset = new ArrayList<String>(sentIds);
  * List<String> notAllowedClasses = new ArrayList<String>();
  *
  * if(constVars.doNotExtractPhraseAnyWordLabeledOtherClass){
  * for(String l: constVars.getAnswerClass().keySet()){
  * if(!l.equals(label)){
  * notAllowedClasses.add(l+":"+l);
  * }
  * }
  * notAllowedClasses.add("OTHERSEM:OTHERSEM");
  * }
  *
  * //Apply the patterns and extract candidate phrases
  * int num = 0;
  * if (constVars.numThreads == 1)
  * num = keyset.size();
  * else
  * num = keyset.size() / (constVars.numThreads - 1);
  * ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads);
  * List<Future<Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>>>> list = new ArrayList<Future<Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>>>>();
  * for (int i = 0; i < constVars.numThreads; i++) {
  *
  * Callable<Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>>> task = null;
  * Map<TokenSequencePattern, Integer> patternsLearnedThisIterConverted = new HashMap<TokenSequencePattern , Integer>();
  * for (Integer pindex : patternsLearnedThisIter.keySet()){
  * SurfacePattern p = constVars.getPatternIndex().get(pindex);
  * TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), p.toString(notAllowedClasses));
  * patternsLearnedThisIterConverted.put(pat, pindex);
  * }
  *
  * task = new ApplyPatternsMulti(sents, keyset.subList(i * num,
  * Math.min(keyset.size(), (i + 1) * num)), patternsLearnedThisIterConverted, label,
  * constVars.removeStopWordsFromSelectedPhrases,
  * constVars.removePhrasesWithStopWords, constVars);
  *
  * Future<Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>>> submit = executor
  * .submit(task);
  * list.add(submit);
  * }
  *
  * // Now retrieve the result
  * for (Future<Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>>> future : list) {
  * try{
  * Pair<TwoDimensionalCounter<Pair<String, String>, Integer>, CollectionValuedMap<Integer, Triple<String, Integer, Integer>>> result = future
  * .get();
  * wordsandLemmaPatExtracted.addAll(result.first());
  * matchedTokensByPat.addAll(result.second());
  * }catch(Exception e){
  * executor.shutdownNow();
  * throw new RuntimeException(e);
  * }
  * }
  * executor.shutdown();
  * }
  */
 protected internal virtual IDictionary <E, IDictionary <string, DataInstance> > GetSentences(IDictionary <E, ICollection <string> > sentids)
 {
     try
     {
         ICollection <File> files = new HashSet <File>();
         IDictionary <E, IDictionary <string, DataInstance> > sentsAll = new Dictionary <E, IDictionary <string, DataInstance> >();
         CollectionValuedMap <string, E> sentIds2Pats = new CollectionValuedMap <string, E>();
         foreach (KeyValuePair <E, ICollection <string> > setEn in sentids)
         {
              if (!sentsAll.ContainsKey(setEn.Key))
             {
                 sentsAll[setEn.Key] = new Dictionary <string, DataInstance>();
             }
             foreach (string s in setEn.Value)
             {
                 sentIds2Pats.Add(s, setEn.Key);
                 if (constVars.batchProcessSents)
                 {
                     File f = Data.sentId2File[s];
                     System.Diagnostics.Debug.Assert(f != null, "How come no file for sentence " + s);
                     files.Add(f);
                 }
             }
         }
         if (constVars.batchProcessSents)
         {
             foreach (File f in files)
             {
                 IDictionary <string, DataInstance> sentsf = IOUtils.ReadObjectFromFile(f);
                 foreach (KeyValuePair <string, DataInstance> s in sentsf)
                 {
                     foreach (E pat in sentIds2Pats[s.Key])
                     {
                         sentsAll[pat][s.Key] = s.Value;
                     }
                 }
             }
         }
         else
         {
             foreach (KeyValuePair <string, DataInstance> s in Data.sents)
             {
                 foreach (E pat in sentIds2Pats[s.Key])
                 {
                     sentsAll[pat][s.Key] = s.Value;
                 }
             }
         }
          //      System.out.println("All sentences are " + sentsAll.entrySet().stream().map( x -> constVars.patternIndex.get(x.getKey())+":"+x.getValue()).collect(Collectors.toList()));
         return(sentsAll);
     }
     catch (TypeLoadException e)
     {
          throw new Exception(e.Message, e);
     }
     catch (IOException e1)
     {
          throw new Exception(e1.Message, e1);
     }
 }
        //  SentenceIndex.SentenceIteratorWithWords queryIndex(SurfacePattern pat){
        //
        //
        //    String[] n = pat.getSimplerTokensNext();
        //    String[] pr = pat.getSimplerTokensPrev();
        //    boolean rest = false;
        //    if(n!=null){
        //      for(String e: n){
        //        if(!specialWords.contains(e)){
        //          rest = true;
        //          break;
        //        }
        //      }
        //    }
        //    if(rest == false && pr!=null){
        //      for(String e: pr){
        //        if(!specialWords.contains(e) && !stopWords.contains(e)){
        //          rest = true;
        //          break;
        //        }
        //      }
        //    }
        //
        //  }
        /// <summary>Returns the ids of all sentences that contain these words.</summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Apache.Lucene.Queryparser.Classic.ParseException"/>
        internal virtual ICollection <string> QueryIndexGetSentences(CollectionValuedMap <string, string> words)
        {
            SetIndexReaderSearcher();
            BooleanQuery query = new BooleanQuery();
            string       pkey  = Token.GetKeyForClass(typeof(PatternsAnnotations.ProcessedTextAnnotation));

            foreach (KeyValuePair <string, ICollection <string> > en in words)
            {
                bool processedKey = en.Key.Equals(pkey);
                foreach (string en2 in en.Value)
                {
                    if (!processedKey || !stopWords.Contains(en2.ToLower()))
                    {
                        query.Add(new BooleanClause(new TermQuery(new Term(en.Key, en2)), BooleanClause.Occur.Must));
                    }
                }
            }
            //query.add(new BooleanClause(new TermQuery(new Term("textannotation","sonal")), BooleanClause.Occur.MUST));
            //    String queryStr = "";
            //    for(Map.Entry<String, Collection<String>> en: words.entrySet()){
            //      for(String en2: en.getValue()){
            //        queryStr+= " " + en.getKey() + ":"+en2;
            //      }
            //    }
            //    QueryParser queryParser = new QueryParser(Version.LUCENE_42, "sentence", analyzer);
            //
            //    queryParser.setDefaultOperator(QueryParser.Operator.AND);
            //
            //    Query query = queryParser.parse(queryStr);
            //Map<String, List<CoreLabel>> sents = null;
            TopDocs tp = searcher.Search(query, int.MaxValue);
            ICollection <string> sentids = new HashSet <string>();

            if (tp.totalHits > 0)
            {
                foreach (ScoreDoc s in tp.scoreDocs)
                {
                    int docId = s.doc;
                    Org.Apache.Lucene.Document.Document d = searcher.Doc(docId);
                    //        byte[] sent = d.getBinaryValue("tokens").bytes;
                    //        if(saveTokens) {
                    //          sents = new HashMap<String, List<CoreLabel>>();
                    //          List<CoreLabel> tokens = readProtoBufAnnotation(sent);
                    //          sents.put(d.get("sentid"), tokens);
                    //        } else{
                    sentids.Add(d.Get("sentid"));
                }
            }
            else
            {
                //}
                throw new Exception("how come no documents for " + words + ". Query formed is " + query);
            }
            //System.out.println("number of sentences for tokens " + words + " are " + sentids);
            //    if(!saveTokens){
            //      sents = getSentences(sentids);
            //    }
            return(sentids);
        }
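A hedged usage sketch, assuming the Lucene index stores one document per sentence with a "sentid" stored field (the annotation key for pkey comes from the method body; the rest is illustrative):

            CollectionValuedMap <string, string> words = new CollectionValuedMap <string, string>();
            words.Add(Token.GetKeyForClass(typeof(PatternsAnnotations.ProcessedTextAnnotation)), "acquired");
            ICollection <string> ids = QueryIndexGetSentences(words);
            // Every (key, term) pair becomes a MUST BooleanClause, so the result is the
            // set of sentence ids matching all terms; stop words are filtered out only
            // for the processed-text key.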
        public override CollectionValuedMap <string, string> GetRelevantWords()
        {
            CollectionValuedMap <string, string> relwordsThisPat = new CollectionValuedMap <string, string>();

            foreach (Pair <Token, GrammaticalRelation> r in relations)
            {
                GetRelevantWordsBase(r.First(), relwordsThisPat);
            }
            return(relwordsThisPat);
        }
Example #6
        public override CollectionValuedMap <string, string> GetRelevantWords()
        {
            CollectionValuedMap <string, string> relwordsThisPat = new CollectionValuedMap <string, string>();

            Token[] next = GetNextContext();
            GetRelevantWordsBase(next, relwordsThisPat);
            Token[] prev = GetPrevContext();
            GetRelevantWordsBase(prev, relwordsThisPat);
            return(relwordsThisPat);
        }
Example #7
 protected internal static void GetRelevantWordsBase(Token t, CollectionValuedMap <string, string> relWords)
 {
     if (t != null)
     {
         IDictionary <string, string> str = t.ClassORRestrictionsAsString();
         if (str != null)
         {
             relWords.AddAll(str);
         }
     }
 }
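Example #6 above passes Token[] contexts to GetRelevantWordsBase, an overload that does not appear on this page. A plausible sketch, assuming it simply delegates to the single-Token version shown in Example #7 (this body is a guess, not the library's code):

 protected internal static void GetRelevantWordsBase(Token[] tokens, CollectionValuedMap <string, string> relWords)
 {
     if (tokens != null)
     {
         foreach (Token t in tokens)
         {
             // reuse the single-Token overload above
             GetRelevantWordsBase(t, relWords);
         }
     }
 }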
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // int cannot be null in C#: look the phrase up and fall back to -1
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            int window = 0;
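            // NOTE: with window == 0, the PREV/NEXT context loops below never run,
            // so only the Cluster- feature contributes to this datum.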

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum <string, string>(feat, label));
        }
Example #9
 /// <summary>Mark twin mentions: all mention boundaries must match.</summary>
 private void FindTwinMentionsStrict()
 {
     for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.Count; sentNum++)
     {
         IList <Mention> golds    = goldOrderedMentionsBySentence[sentNum];
         IList <Mention> predicts = predictedOrderedMentionsBySentence[sentNum];
         // For CoNLL training there are some documents with gold mentions with the same position offsets
         // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
         //  (Packwood - Roth)
         CollectionValuedMap <IntPair, Mention> goldMentionPositions = new CollectionValuedMap <IntPair, Mention>();
         foreach (Mention g in golds)
         {
             IntPair ip = new IntPair(g.startIndex, g.endIndex);
             if (goldMentionPositions.Contains(ip))
             {
                 StringBuilder existingMentions = new StringBuilder();
                 foreach (Mention eg in goldMentionPositions[ip])
                 {
                     if (existingMentions.Length > 0)
                     {
                         existingMentions.Append(",");
                     }
                     existingMentions.Append(eg.mentionID);
                 }
                 SieveCoreferenceSystem.logger.Warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.SpanToString());
             }
             //assert(!goldMentionPositions.containsKey(ip));
             goldMentionPositions.Add(new IntPair(g.startIndex, g.endIndex), g);
         }
         foreach (Mention p in predicts)
         {
             IntPair pos = new IntPair(p.startIndex, p.endIndex);
             if (goldMentionPositions.Contains(pos))
             {
                 ICollection <Mention> cm = goldMentionPositions[pos];
                 // C# enumerators start before the first element; advance before reading Current
                 IEnumerator <Mention> cmIt = cm.GetEnumerator();
                 cmIt.MoveNext();
                 Mention g_1 = cmIt.Current;
                 cm.Remove(g_1);
                 p.mentionID  = g_1.mentionID;
                 p.twinless   = false;
                 g_1.twinless = false;
             }
         }
         // temp: for making easy to recognize twinless mention
         foreach (Mention p_1 in predicts)
         {
             if (p_1.twinless)
             {
                 p_1.mentionID += 10000;
             }
         }
     }
 }
        public virtual CollectionValuedMap <string, JollyDayHolidays.JollyHoliday> GetAllHolidaysCVMap(ICollection <Holiday> allHolidays)
        {
            CollectionValuedMap <string, JollyDayHolidays.JollyHoliday> map = new CollectionValuedMap <string, JollyDayHolidays.JollyHoliday>();

            foreach (Holiday h in allHolidays)
            {
                string descKey = h.GetDescriptionPropertiesKey();
                if (descKey != null)
                {
                    descKey = descKey.ReplaceAll(".*\\.", string.Empty);
                    JollyDayHolidays.JollyHoliday jh = new JollyDayHolidays.JollyHoliday(descKey, holidayManager, h);
                    map.Add(jh.label, jh);
                }
            }
            return(map);
        }
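For example, a hypothetical JollyDay description key such as christian.holidays.CHRISTMAS is reduced by the greedy ".*\." replacement to CHRISTMAS before being wrapped in a JollyHoliday and stored under jh.label.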
Example #11
 private void ReadSRLFile(string srlFile)
 {
     srlMap = Generics.NewHashMap();
     foreach (string line in ObjectBank.GetLineIterator(new File(srlFile)))
     {
         string[] bits     = line.Split("\\s+", 3);
         string   filename = bits[0];
         int      treeNum  = System.Convert.ToInt32(bits[1]);
         string   info     = bits[2];
         CollectionValuedMap <int, string> cvm = srlMap[filename];
         if (cvm == null)
         {
             cvm = new CollectionValuedMap <int, string>();
             srlMap[filename] = cvm;
         }
         cvm.Add(treeNum, info);
     }
 }
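Each line is split into at most three whitespace-separated fields: the source file name, the tree number within that file, and an SRL info string parsed later in ProcessFile (Example #15). An illustrative PropBank-style line, not taken from a real corpus:

     wsj/00/wsj_0001.mrg 5 8 gold say.01 vn--a 0:2-ARG0 8:0-rel 9:1-ARG1

Here 5 is the tree number; within the info string, 8 is the verb's terminal index, say.01 carries the lemma, and the remaining fields are argument spans.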
        /// <summary>
        /// Given a CollectionValuedMap of vectors, treats each outer key as the label
        /// for its set of inner vectors.
        /// </summary>
        /// <remarks>
        /// Given a CollectionValuedMap of vectors, treats each outer key as the label
        /// for its set of inner vectors.
        /// NOTE: if l2NormalizeVectors is true, creates a copy of each vector and applies
        /// l2Normalize to it.
        /// </remarks>
        public virtual KNNClassifier <K, V> Train(CollectionValuedMap <K, ICounter <V> > vecBag)
        {
            KNNClassifier <K, V>           classifier = new KNNClassifier <K, V>(k, weightedVotes, l2NormalizeVectors);
            ICollection <RVFDatum <K, V> > instances  = new List <RVFDatum <K, V> >();

            foreach (K label in vecBag.Keys)
            {
                RVFDatum <K, V> datum;
                foreach (ICounter <V> vector in vecBag[label])
                {
                    if (l2NormalizeVectors)
                    {
                        datum = new RVFDatum <K, V>(Counters.L2Normalize(new ClassicCounter <V>(vector)), label);
                    }
                    else
                    {
                        datum = new RVFDatum <K, V>(vector, label);
                    }
                    instances.Add(datum);
                }
            }
            classifier.AddInstances(instances);
            return(classifier);
        }
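A minimal training sketch, assuming this Train method lives on a factory object (called factory here; the factory type, its construction, and the feature names are illustrative):

            CollectionValuedMap <string, ICounter <string> > vecBag = new CollectionValuedMap <string, ICounter <string> >();
            ICounter <string> doc1 = new ClassicCounter <string>();
            doc1.SetCount("goal", 3.0);
            doc1.SetCount("match", 1.0);
            vecBag.Add("sports", doc1);   // outer key = label, inner collection = that label's vectors
            KNNClassifier <string, string> knn = factory.Train(vecBag);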
Example #13
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords,
                                                                  CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms,
                                                                  TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        // BUG (transpilation artifact): the Java lambda that appended each word's
                        // gold-label status to s was lost here; ForEach(null) would throw at runtime.
                        finalwords.KeySet().Stream().ForEach(null);
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        IJsonArray  objarr     = jsonReader.ReadArray();
                        foreach (IJsonValue o in objarr)
                        {
                            obj.Add(o);
                        }
                        jsonReader.Close();
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // advance the enumerator before reading Current
                            IEnumerator <CandidatePhrase> wordsIt = words.GetEnumerator();
                            wordsIt.MoveNext();
                            bestw = wordsIt.Current;
                        }
                        else
                        {
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
Example #14
        public virtual void ApplyPats(ICounter <E> patterns, string label, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat,
                                      ICollection <CandidatePhrase> alreadyLabeledWords)
        {
            //   Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
            //   Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
            //    Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
            foreach (KeyValuePair <string, Env> en in constVars.env)
            {
                en.Value.GetVariables().PutAll(ConstantsAndVariables.globalEnv.GetVariables());
            }
            IDictionary <E, IDictionary <string, DataInstance> > sentencesForPatterns = GetSentences(constVars.invertedIndex.QueryIndex(patterns.KeySet()));

            foreach (KeyValuePair <E, IDictionary <string, DataInstance> > en_1 in sentencesForPatterns)
            {
                RunParallelApplyPats(en_1.Value, label, en_1.Key, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
            }
            Redwood.Log(Redwood.Dbg, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.Size());
        }
Example #15
        /// <summary>Load a collection of parse trees from the file of given name.</summary>
        /// <remarks>
        /// Load a collection of parse trees from the file of given name.
        /// Each tree may optionally be encased in parens to allow for Penn
        /// Treebank style trees.
        /// This method implements the <code>FileProcessor</code> interface.
        /// </remarks>
        /// <param name="file">file to load a tree from</param>
        public void ProcessFile(File file)
        {
            ITreeReader tr = null;
            // SRL stuff
            CollectionValuedMap <int, string> srlMap = null;

            if (this.srlMap != null)
            {
                // there must be a better way ...
                string filename = file.GetAbsolutePath();
                foreach (string suffix in this.srlMap.Keys)
                {
                    if (filename.EndsWith(suffix))
                    {
                        srlMap = this.srlMap[suffix];
                        break;
                    }
                }
                if (srlMap == null)
                {
                    log.Info("could not find SRL entries for file: " + file);
                }
            }
            try
            {
                // maybe print file name to stdout to get some feedback
                // could throw an IO exception if can't open for reading
                tr = TreeReaderFactory().NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), Encoding())));
                int  sentIndex = 0;
                Tree pt;
                while ((pt = tr.ReadTree()) != null)
                {
                    if (pt.Label() is IHasIndex)
                    {
                        // so we can trace where this tree came from
                        IHasIndex hi = (IHasIndex)pt.Label();
                        hi.SetDocID(file.GetName());
                        hi.SetSentIndex(sentIndex);
                    }
                    if (srlMap == null)
                    {
                        parseTrees.Add(pt);
                    }
                    else
                    {
                        ICollection <string> srls = srlMap[sentIndex];
                        //           pt.pennPrint();
                        //           log.info(srls);
                        parseTrees.Add(pt);
                    if (!srls.IsEmpty())
                    {
                        //            parseTrees.add(pt);
                            foreach (string srl in srls)
                            {
                                //              Tree t = pt.deepCopy();
                                string[] bits      = srl.Split("\\s+");
                                int      verbIndex = System.Convert.ToInt32(bits[0]);
                                string   lemma     = bits[2].Split("\\.")[0];
                                //              Tree verb = Trees.getTerminal(t, verbIndex);
                                Tree verb = Edu.Stanford.Nlp.Trees.Trees.GetTerminal(pt, verbIndex);
                                //              ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL);
                                ((CoreLabel)verb.Label()).Set(typeof(CoreAnnotations.CoNLLPredicateAnnotation), true);
                                for (int i = 4; i < bits.Length; i++)
                                {
                                    string   arg = bits[i];
                                    // both branches of the original if/else performed the same split
                                    string[] bits1 = arg.Split("-");
                                    string locs    = bits1[0];
                                    string argType = bits1[1];
                                    if (argType.Equals("rel"))
                                    {
                                        continue;
                                    }
                                    foreach (string loc in locs.Split("[*,]"))
                                    {
                                        bits1 = loc.Split(":");
                                        int term   = System.Convert.ToInt32(bits1[0]);
                                        int height = System.Convert.ToInt32(bits1[1]);
                                        //                  Tree t1 = Trees.getPreTerminal(t, term);
                                        Tree t1 = Edu.Stanford.Nlp.Trees.Trees.GetPreTerminal(pt, term);
                                        for (int j = 0; j < height; j++)
                                        {
                                            //                    t1 = t1.parent(t);
                                            t1 = t1.Parent(pt);
                                        }
                                        IDictionary <int, string> roleMap = ((CoreLabel)t1.Label()).Get(typeof(CoreAnnotations.CoNLLSRLAnnotation));
                                        if (roleMap == null)
                                        {
                                            roleMap = Generics.NewHashMap();
                                            ((CoreLabel)t1.Label()).Set(typeof(CoreAnnotations.CoNLLSRLAnnotation), roleMap);
                                        }
                                        roleMap[verbIndex] = argType;
                                    }
                                }
                            }
                        }
                    }
                    //                  ((CoreLabel)t1.label()).set(SRLIDAnnotation.class, SRL_ID.ARG);
                    //               for (Tree t1 : t) {
                    //                 if (t1.isLeaf()) { continue; }
                    //                 CoreLabel fl = (CoreLabel)t1.label();
                    //                 if (fl.value() == null) { continue; }
                    //                 if (!fl.has(SRLIDAnnotation.class)) {
                    //                   boolean allNone = true;
                    //                   for (Tree t2 : t1) {
                    //                     SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class);
                    //                     if (s == SRL_ID.ARG || s == SRL_ID.REL) {
                    //                       allNone = false;
                    //                       break;
                    //                     }
                    //                   }
                    //                   if (allNone) {
                    //                     fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO);
                    //                   } else {
                    //                     fl.set(SRLIDAnnotation.class, SRL_ID.NO);
                    //                   }
                    //                 }
                    //               }
                    //              parseTrees.add(t);
                    sentIndex++;
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException("MemoryTreebank.processFile IOException in file " + file, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(tr);
            }
        }
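For reference, an argument field such as 0:2-ARG0 in the SRL info parsed above names terminal 0, walks up 2 parent levels from its preterminal, and records ARG0 in that node's CoNLLSRLAnnotation role map (keyed by the verb's index); fields typed rel are skipped, and multiple spans may be joined with * or ,.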
Example #16
        private void RunParallelApplyPats(IDictionary <string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted,
                                          CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICollection <CandidatePhrase> alreadyLabeledWords)
        {
            Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
            IList <string> notAllowedClasses = new List <string>();
            IList <string> sentids           = CollectionUtils.ToList(sents.Keys);

            if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
            {
                foreach (string l in constVars.GetAnswerClass().Keys)
                {
                    if (!l.Equals(label))
                    {
                        notAllowedClasses.Add(l);
                    }
                }
                notAllowedClasses.Add("OTHERSEM");
            }
            IDictionary <TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
            IDictionary <SemgrexPattern, E>       depPatternsLearnedThisIterConverted     = null;

            if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
            {
                surfacePatternsLearnedThisIterConverted = new Dictionary <TokenSequencePattern, E>();
                string patternStr = null;
                try
                {
                    patternStr = pattern.ToString(notAllowedClasses);
                    TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
                    surfacePatternsLearnedThisIterConverted[pat] = pattern;
                }
                catch (Exception e)
                {
                    log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
                    throw;
                }
            }
            else
            {
                if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
                {
                    depPatternsLearnedThisIterConverted = new Dictionary <SemgrexPattern, E>();
                    SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
                    depPatternsLearnedThisIterConverted[pat] = pattern;
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            //Apply the patterns and extract candidate phrases
            int num;
            int numThreads = constVars.numThreads;

            // If there are only a few sentences, do not create that many threads
            if (sents.Count < 50)
            {
                numThreads = 1;
            }
            if (numThreads == 1)
            {
                num = sents.Count;
            }
            else
            {
                num = sents.Count / (numThreads - 1);
            }
            IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);
            IList <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > > list =
                new List <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > >();

            for (int i = 0; i < numThreads; i++)
            {
                ICallable <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > task = null;
                if (pattern.type.Equals(PatternFactory.PatternType.Surface))
                {
                    //Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
                    task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                             , constVars);
                }
                else
                {
                    task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                                , constVars);
                }
                IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > submit = executor.Submit(task);
                list.Add(submit);
            }
            // Now retrieve the result
            foreach (IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > future in list)
            {
                try
                {
                    Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > result = future.Get();
                    Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
                    wordsandLemmaPatExtracted.AddAll(result.First());
                    matchedTokensByPat.AddAll(result.Second());
                    Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
                }
                catch (Exception e)
                {
                    executor.ShutdownNow();
                    throw new Exception(e.Message, e);
                }
            }
            executor.Shutdown();
        }
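A quick check of the chunking arithmetic above: with 10 sentences and numThreads == 4, num = 10 / 3 == 3, so the threads receive the sublists [0,3), [3,6), [6,9), and [9,10); the Math.Min guard keeps the final slice in bounds.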
Example #17
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public virtual ICounter <CandidatePhrase> LearnNewPhrases(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns,
                                                                  CollectionValuedMap <E, Triple <string, int, int> > tokensMatchedPatterns, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms,
                                                                  TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords)
        {
            bool computeProcDataFreq = false;

            if (Data.processedDataFreq == null)
            {
                computeProcDataFreq    = true;
                Data.processedDataFreq = new ClassicCounter <CandidatePhrase>();
                System.Diagnostics.Debug.Assert(Data.rawFreq != null);
            }
            ICollection <CandidatePhrase> alreadyIdentifiedWords = new HashSet <CandidatePhrase>(constVars.GetLearnedWords(label).KeySet());

            Sharpen.Collections.AddAll(alreadyIdentifiedWords, constVars.GetSeedLabelDictionary()[label]);
            ICounter <CandidatePhrase> words = LearnNewPhrasesPrivate(label, patternsForEachToken, patternsLearnedThisIter, allSelectedPatterns, alreadyIdentifiedWords, tokensMatchedPatterns, scoreForAllWordsThisIteration, terms, wordsPatExtracted, patternsAndWords4Label
                                                                      , identifier, ignoreWords, computeProcDataFreq);

            //constVars.addLabelDictionary(label, words.keySet());
            return(words);
        }
 //Here, the index (startIndex, endIndex) seems to be inclusive of the endIndex
 public virtual void PrintSubGraph(SemanticGraph g, IndexedWord w, IList <string> additionalCutOffRels, IList <string> textTokens, ICollection <string> listOfOutput, ICollection <IntPair> listOfOutputIndices,
                                   IList <IndexedWord> seenNodes, IList <IndexedWord> doNotAddThese, bool findSubTrees, ICollection <ExtractedPhrase> extractedPhrases, SemgrexPattern pattern, IPredicate <CoreLabel> acceptWord)
 {
     try
     {
         if (seenNodes.Contains(w))
         {
             return;
         }
         seenNodes.Add(w);
         if (doNotAddThese.Contains(w))
         {
             return;
         }
         IList <IndexedWord> andNodes = new List <IndexedWord>();
         DescendantsWithReln(g, w, "conj_and", new List <IndexedWord>(), andNodes);
         //System.out.println("and nodes are " + andNodes);
         foreach (IndexedWord w1 in andNodes)
         {
             PrintSubGraph(g, w1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
         }
         Sharpen.Collections.AddAll(doNotAddThese, andNodes);
         IList <string> allCutOffRels = new List <string>();
         if (additionalCutOffRels != null)
         {
             Sharpen.Collections.AddAll(allCutOffRels, additionalCutOffRels);
         }
         Sharpen.Collections.AddAll(allCutOffRels, cutoffRelations);
         CollectionValuedMap <int, string> featPerToken = new CollectionValuedMap <int, string>();
         ICollection <string> feat = new List <string>();
         GetPatternsFromDataMultiClass.GetFeatures(g, w, true, feat, null);
         ICollection <IndexedWord> words = Descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
         // words.addAll(andNodes);
         // if (includeSiblings == true) {
         // for (IndexedWord ws : g.getSiblings(w)) {
         // if (additionalCutOffNodes == null
         // || !additionalCutOffNodes.contains(g.reln(g.getParent(w),
         // ws).getShortName()))
         // words.addAll(descendants(g, ws, additionalCutOffNodes, doNotAddThese));
         // }
         // }
         // if(afterand != null){
         // Set<IndexedWord> wordsAnd = descendants(g,afterand,
         // additionalCutOffNodes);
         // words.removeAll(wordsAnd);
         // printSubGraph(g,afterand, includeSiblings, additionalCutOffNodes);
         // }
         //System.out.println("words are " + words);
         if (words.Count > 0)
         {
             int min = int.MaxValue;
             int max = -1;
             foreach (IndexedWord word in words)
             {
                 if (word.Index() < min)
                 {
                     min = word.Index();
                 }
                 if (word.Index() > max)
                 {
                     max = word.Index();
                 }
             }
             IntPair indices;
             // Map<Integer, String> ph = new TreeMap<Integer, String>();
             // String phrase = "";
             // for (IndexedWord word : words) {
             // ph.put(word.index(), word.value());
             // }
             // phrase = StringUtils.join(ph.values(), " ");
             if ((max - min + 1) > maxPhraseLength)
             {
                 max = min + maxPhraseLength - 1;
             }
             indices = new IntPair(min - 1, max - 1);
             string phrase = StringUtils.Join(textTokens.SubList(min - 1, max), " ");
             phrase = phrase.Trim();
             feat.Add("LENGTH-" + (max - min + 1));
             for (int i = min; i <= max; i++)
             {
                 Sharpen.Collections.AddAll(feat, featPerToken[i]);
             }
             //System.out.println("phrase is " + phrase  + " index is " + indices + " and maxphraselength is " + maxPhraseLength + " and descendentset is " + words);
             ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.AsCounter(feat));
              // doNotAddThese holds IndexedWord nodes, so it can never contain the
              // string phrase; only listOfOutput needs checking here.
              if (!listOfOutput.Contains(phrase))
             {
                 //          if (sentElem != null) {
                 //            Element node = new Element(elemString, curNS);
                 //            node.addContent(phrase);
                 //            sentElem.addContent(node);
                 //          }
                 listOfOutput.Add(phrase);
                 if (!listOfOutputIndices.Contains(indices))
                 {
                     listOfOutputIndices.Add(indices);
                     extractedPhrases.Add(extractedPh);
                 }
                  if (findSubTrees)
                 {
                     foreach (IndexedWord word_1 in words)
                     {
                         if (!seenNodes.Contains(word_1))
                         {
                             PrintSubGraph(g, word_1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
                         }
                     }
                 }
             }
         }
     }
     catch (Exception e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
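// A minimal call sketch (hypothetical names: 'graph', 'root', 'tokens',
// 'semgrexPattern' and 'acceptAll' stand in for caller-supplied values;
// acceptAll is an IPredicate<CoreLabel> that accepts every token):
//   ICollection<string> phrases = new List<string>();
//   ICollection<IntPair> spans = new List<IntPair>();
//   ICollection<ExtractedPhrase> extracted = new List<ExtractedPhrase>();
//   PrintSubGraph(graph, root, null, tokens, phrases, spans,
//                 new List<IndexedWord>(), new List<IndexedWord>(),
//                 false, extracted, semgrexPattern, acceptAll);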
       /// <exception cref="System.Exception"/>
       public static ICollection <IndexedWord> Descendants(SemanticGraph g, IndexedWord vertex, IList <string> allCutOffRels, IList <IndexedWord> doNotAddThese, bool ignoreCommonTags, IPredicate <CoreLabel> acceptWord, CollectionValuedMap <int, string> feat)
        {
            // Do a depth first search
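             // Delegates to DescendantsHelper, which prunes at nodes in doNotAddThese,
             // at edges whose relation violates allCutOffRels, and at tags matching
             // cutoffTags; with ignoreCommonTags, tokens whose tag is in ignoreTags are
             // left out of the set but their children are still explored. Per-token
             // dependency features are accumulated into 'feat'.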
            ICollection <IndexedWord> descendantSet = new HashSet <IndexedWord>();

            if (doNotAddThese != null && doNotAddThese.Contains(vertex))
            {
                return(descendantSet);
            }
            if (!acceptWord.Test(vertex.BackingLabel()))
            {
                return(descendantSet);
            }
            DescendantsHelper(g, vertex, descendantSet, allCutOffRels, doNotAddThese, new List <IndexedWord>(), ignoreCommonTags, acceptWord, feat);
            //    String descStr = "";
            //    for(IndexedWord descendant: descendantSet){
            //      descStr += descendant.word()+" ";
            //    }
            //    System.out.println(descStr);
            return(descendantSet);
        }
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
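    // Surface-pattern variant: runs each TokenSequencePattern over the sentences
    // in 'sentids', optionally clubs neighboring already-labeled tokens into the
    // match, applies the configured stop-word filters, and returns a Triple of
    // (phrase-by-pattern counts, matched spans per pattern, phrases whose tokens
    // were all already labeled).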
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                  //Higher branch values make the matching faster but use more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                      //check whether the neighboring words are labeled - if so, club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                          if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] && !addedindices[i_3] && addedindices[i_3 + 1])
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         logger.Error(e);
         throw;
     }
 }
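// Sketch of consuming the returned Triple (accessor names follow the Sharpen
// conversion of edu.stanford.nlp.util.Triple and are best-effort assumptions):
//   var res = Call();
//   TwoDimensionalCounter<CandidatePhrase, E> freq = res.First();
//   CollectionValuedMap<E, Triple<string, int, int>> spansByPattern = res.Second();
//   ICollection<CandidatePhrase> alreadyLabeled = res.Third();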
/// <exception cref="System.Exception"/>
private static void DescendantsHelper(SemanticGraph g, IndexedWord curr, ICollection <IndexedWord> descendantSet, IList <string> allCutOffRels, IList <IndexedWord> doNotAddThese, IList <IndexedWord> seenNodes, bool ignoreCommonTags, IPredicate <CoreLabel> acceptWord, CollectionValuedMap <int, string> feat)
 {
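    // Depth-first traversal from curr. A visited node joins descendantSet unless
    // ignoreCommonTags is set and its tag is in ignoreTags; recursion into a child
    // is cut off when the child is in doNotAddThese, when the edge relation fails
    // CheckIfSatisfiesRelConstrains against allCutOffRels, or when the child's tag
    // matches one of cutoffTags. Features for curr are collected into 'feat' once
    // per surviving edge.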
     if (seenNodes.Contains(curr))
     {
         return;
     }
     seenNodes.Add(curr);
     if (descendantSet.Contains(curr) || (doNotAddThese != null && doNotAddThese.Contains(curr)) || !acceptWord.Test(curr.BackingLabel()))
     {
         return;
     }
     if (!ignoreCommonTags || !ignoreTags.Contains(curr.Tag().Trim()))
     {
         descendantSet.Add(curr);
     }
     foreach (IndexedWord child in g.GetChildren(curr))
     {
         bool dontuse = false;
         if (doNotAddThese != null && doNotAddThese.Contains(child))
         {
             dontuse = true;
         }
         GrammaticalRelation rel = null;
          if (!dontuse)
         {
             rel     = g.Reln(curr, child);
             dontuse = CheckIfSatisfiesRelConstrains(g, curr, child, allCutOffRels, rel);
         }
          if (!dontuse)
         {
             foreach (string cutOffTagRegex in cutoffTags)
             {
                 if (child.Tag().Matches(cutOffTagRegex))
                 {
                     if (Debug >= 5)
                     {
                         System.Console.Out.WriteLine("ignored tag " + child + " because it satisfied " + cutOffTagRegex);
                     }
                     dontuse = true;
                     break;
                 }
             }
         }
          if (!dontuse)
         {
             if (!feat.Contains(curr.Index()))
             {
                 feat[curr.Index()] = new List <string>();
             }
             GetPatternsFromDataMultiClass.GetFeatures(g, curr, false, feat[curr.Index()], rel);
             //feat.add(curr.index(), "REL-" + rel.getShortName());
             DescendantsHelper(g, child, descendantSet, allCutOffRels, doNotAddThese, seenNodes, ignoreCommonTags, acceptWord, feat);
         }
     }
 }
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
            // CollectionValuedMap<String, Integer>();
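             // Dependency-pattern variant: matches each SemgrexPattern against the
             // sentence's SemanticGraph via GetMatchedTokensIndex, then applies the
             // same clubbing and stop-word filtering as the surface version before
             // counting candidate phrases (here with the match's features attached).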
            TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();

            foreach (string sentid in sentids)
            {
                DataInstance      sent   = sents[sentid];
                IList <CoreLabel> tokens = sent.GetTokens();
                foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
                {
                    if (pEn.Key == null)
                    {
                        throw new Exception("why is the pattern " + pEn + " null?");
                    }
                    SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
                    //SemgrexMatcher m = pEn.getKey().matcher(graph);
                    //TokenSequenceMatcher m = pEn.getKey().matcher(sent);
                    //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                    //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                     //Higher branch values make the matching faster but use more memory
                    //m.setBranchLimit(5);
                    ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
                    foreach (ExtractedPhrase match in matched)
                    {
                        int    s                 = match.startIndex;
                        int    e                 = match.endIndex + 1;
                        string phrase            = string.Empty;
                        string phraseLemma       = string.Empty;
                        bool   useWordNotLabeled = false;
                        bool   doNotUse          = false;
                         //check whether the neighboring words are labeled - if so, club them together
                        if (constVars.clubNeighboringLabeledWords)
                        {
                            for (int i = s - 1; i >= 0; i--)
                            {
                                if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    s = i;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
                                    break;
                                }
                            }
                            for (int i_1 = e; i_1 < tokens.Count; i_1++)
                            {
                                if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    e = i_1;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
                                    break;
                                }
                            }
                        }
                        //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                        bool[] addedindices = new bool[e - s];
                        // Arrays.fill(addedindices, false); // get for free on array initialization
                        for (int i_2 = s; i_2 < e; i_2++)
                        {
                            CoreLabel l = tokens[i_2];
                            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                            {
                                l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                            }
                            Pattern pSur = pEn.Value;
                            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                            foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                            {
                                if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                                {
                                    doNotUse = true;
                                }
                            }
                            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                            if (removePhrasesWithStopWords && containsStop)
                            {
                                doNotUse = true;
                            }
                            else
                            {
                                if (!containsStop || !removeStopWordsFromSelectedPhrases)
                                {
                                    if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                    {
                                        useWordNotLabeled = true;
                                    }
                                    phrase               += " " + l.Word();
                                    phraseLemma          += " " + l.Lemma();
                                    addedindices[i_2 - s] = true;
                                }
                            }
                        }
                        for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                        {
                             if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] && !addedindices[i_3] && addedindices[i_3 + 1])
                            {
                                doNotUse = true;
                                break;
                            }
                        }
                         if (!doNotUse && useWordNotLabeled)
                         {
                             matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                             phrase      = phrase.Trim();
                             phraseLemma = phraseLemma.Trim();
                             allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                         }
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
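             // Batch surface-pattern variant: matches all patterns in a single pass
             // with a MultiPatternMatcher (FIND_ALL), then applies the same clubbing
             // and stop-word filtering; unlike the variants above, the matched span is
             // recorded before filtering, and plain (phrase, lemma) pairs are counted
             // instead of CandidatePhrases.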
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
            TwoDimensionalCounter <Pair <string, string>, E>    allFreq            = new TwoDimensionalCounter <Pair <string, string>, E>();

            foreach (string sentid in sentids)
            {
                IList <CoreLabel> sent = sents[sentid].GetTokens();
                //FIND_ALL is faster than FIND_NONOVERLAP
                IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
                foreach (ISequenceMatchResult <ICoreMap> m in matched)
                {
                    int s          = m.Start("$term");
                    int e          = m.End("$term");
                    E   matchedPat = patterns[m.Pattern()];
                    matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e));
                    string phrase            = string.Empty;
                    string phraseLemma       = string.Empty;
                    bool   useWordNotLabeled = false;
                    bool   doNotUse          = false;
                     //check whether the neighboring words are labeled - if so, club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // unneeded as done on initialization
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)))
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
                        // if (restrictToMatched) {
                        // tokensMatchedPattern.add(sentid, i);
                        // }
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString()))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase               += " " + l.Word();
                                phraseLemma          += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] && !addedindices[i_3] && addedindices[i_3 + 1])
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse && useWordNotLabeled)
                    {
                        phrase      = phrase.Trim();
                        phraseLemma = phraseLemma.Trim();
                        allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0);
                    }
                }
            }
            //      for (SurfacePattern pat : patterns.keySet()) {
            //        String patternStr = pat.toString();
            //
            //        TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
            //        if (pat == null || p == null)
            //          throw new RuntimeException("why is the pattern " + pat + " null?");
            //
            //        TokenSequenceMatcher m = p.getMatcher(sent);
            //        while (m.find()) {
            //
            //          int s = m.start("$term");
            //          int e = m.end("$term");
            //
            //          String phrase = "";
            //          String phraseLemma = "";
            //          boolean useWordNotLabeled = false;
            //          boolean doNotUse = false;
            //          for (int i = s; i < e; i++) {
            //            CoreLabel l = sent.get(i);
            //            l.set(PatternsAnnotations.MatchedPattern.class, true);
            //            if (restrictToMatched) {
            //              tokensMatchedPattern.add(sentid, i);
            //            }
            //            for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) {
            //              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
            //                doNotUse = true;
            //              }
            //            }
            //            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords);
            //            if (removePhrasesWithStopWords && containsStop) {
            //              doNotUse = true;
            //            } else {
            //              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            //
            //                if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) {
            //                  useWordNotLabeled = true;
            //                }
            //                phrase += " " + l.word();
            //                phraseLemma += " " + l.lemma();
            //
            //              }
            //            }
            //          }
            //          if (!doNotUse && useWordNotLabeled) {
            //            phrase = phrase.trim();
            //            phraseLemma = phraseLemma.trim();
            //            allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0);
            //          }
            //        }
            //      }
            return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }