/// <summary>
/// Converts a tree to the Morfette training format: one "word lemma morph"
/// line per leaf token.
/// </summary>
/// <param name="tree">A parse tree whose leaves and preterminals are CoreLabels.</param>
/// <returns>The Morfette-formatted representation of the tree's yield.</returns>
        private static string TreeToMorfette(Tree tree)
        {
            StringBuilder sb = new StringBuilder();
            IList<ILabel> yield = tree.Yield();
            IList<ILabel> tagYield = tree.PreTerminalYield();

            // Every leaf must have a corresponding preterminal (POS) node.
            System.Diagnostics.Debug.Assert(yield.Count == tagYield.Count);
            int listLen = yield.Count;

            for (int i = 0; i < listLen; ++i)
            {
                CoreLabel token = (CoreLabel)yield[i];
                CoreLabel tag = (CoreLabel)tagYield[i];
                // The morphological analysis is carried in OriginalText; fall back to the POS tag.
                string morphStr = token.OriginalText();
                if (string.IsNullOrEmpty(morphStr))
                {
                    morphStr = tag.Value();
                }
                // Fall back to the surface form when no lemma is available.
                string lemma = token.Lemma();
                if (string.IsNullOrEmpty(lemma))
                {
                    lemma = token.Value();
                }
                // BUGFIX: the Java-style "%s %s %s%n" specifiers are not interpreted by
                // .NET's string.Format and would have been emitted literally; use
                // {0}-style placeholders and AppendLine for the platform newline (%n).
                sb.AppendFormat("{0} {1} {2}", token.Value(), lemma, morphStr).AppendLine();
            }
            return sb.ToString();
        }
 /// <summary>
 /// Returns true when the token should be treated as a stop word: either its
 /// lemma or surface form is in the common-English-word set, or its lemma fully
 /// matches the ignore regex.
 /// </summary>
 private static bool ContainsStopWord(CoreLabel l, ICollection <string> commonEngWords, Pattern ignoreWordRegex)
 {
     // Check membership in the common-word list first (lemma, then surface form).
     if (commonEngWords != null && (commonEngWords.Contains(l.Lemma()) || commonEngWords.Contains(l.Word())))
     {
         return(true);
     }
     // Otherwise fall back to the ignore regex, when one was supplied.
     return(ignoreWordRegex != null && ignoreWordRegex.Matcher(l.Lemma()).Matches());
 }
        /// <summary>
        /// Builds a real-valued feature datum for token <paramref name="i"/> of the sentence.
        /// Features are cluster ids of the phrases matched at this token; the PREV/NEXT
        /// context loops are currently disabled (window == 0).
        /// </summary>
        /// <param name="sent">The tokenized sentence.</param>
        /// <param name="i">Index of the token to featurize.</param>
        /// <returns>An RVFDatum labeled with the answer label or "O" (outside).</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter<string> feat = new ClassicCounter<string>();
            CoreLabel l = sent[i];
            string label;
            // Label is the answer label when this token's answer class matches, else "O".
            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
            if (matchedPhrases == null)
            {
                // No phrase annotation: treat the token's own word as the matched phrase.
                matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // BUGFIX: the original compared a non-nullable int to null (always false),
                // and the dictionary indexer throws for unknown phrases where the Java
                // original's Map.get returned null. Use TryGetValue with a -1 sentinel.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            // Context window is disabled; both loops below are no-ops while window == 0.
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum<string, string>(feat, label));
        }
// Exemplo n.º 4
// 0
        /// <summary>
        /// Munges lemma and morphological analyses from a Morfette output file into
        /// the leaves of the trees read from a tree file, printing each munged tree.
        /// </summary>
        /// <param name="args">args[0] = tree file; args[1] = Morfette TnT output file.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                // NOTE(review): Printf appears to be a Sharpen shim that understands
                // Java-style %s/%n specifiers, so its format string is left as-is — confirm.
                System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
                System.Environment.Exit(-1);
            }
            string treeFile = args[0];
            string morfetteFile = args[1];
            ITreeReaderFactory trf = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                IEnumerator<IList<CoreLabel>> morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
                // Walk the trees and the Morfette analyses in lockstep.
                for (Tree tree; (tree = tr.ReadTree()) != null && morfetteItr.MoveNext();)
                {
                    IList<CoreLabel> analysis = morfetteItr.Current;
                    IList<ILabel> yield = tree.Yield();
                    // Each tree leaf must have exactly one Morfette analysis.
                    System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
                    int yieldLen = yield.Count;
                    for (int i = 0; i < yieldLen; ++i)
                    {
                        CoreLabel tokenAnalysis = analysis[i];
                        ILabel token = yield[i];
                        string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma());
                        // BUGFIX: the Java-style "%s%s%s%s%s" specifiers are not interpreted
                        // by .NET's string.Format and would be emitted literally; use
                        // {0}-style placeholders so the leaf really contains word+lemma+tag.
                        string newLeaf = string.Format("{0}{1}{2}{3}{4}", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag());
                        ((CoreLabel)token).SetValue(newLeaf);
                    }
                    System.Console.Out.WriteLine(tree.ToString());
                }
                // If either input still has content, the two files were not aligned.
                if (tr.ReadTree() != null || morfetteItr.MoveNext())
                {
                    log.Info("WARNING: Uneven input files!");
                }
                tr.Close();
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Verifies that CoreLabel.SetWord invalidates the lemma exactly when the word
        /// actually changes, and that equality and hash codes track that behavior
        /// (including that a null lemma value and an absent lemma key hash identically).
        /// </summary>
        public virtual void TestCoreLabelSetWordBehavior()
        {
            CoreLabel foo = new CoreLabel();

            foo.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
            foo.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
            foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
            // Snapshot the fully-populated label; SetWord is expected to remove the
            // lemma whenever the word changes, which the copy lets us detect.
            ArrayCoreMap copy = new ArrayCoreMap(foo);

            NUnit.Framework.Assert.AreEqual(copy, foo);
            foo.SetWord("foo");
            NUnit.Framework.Assert.AreEqual(copy, foo);
            // Setting the same word must NOT clear the lemma — still equal to the copy.
            foo.SetWord("bar");
            NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
            // A different word cleared the lemma, so equality breaks.
            foo.SetWord("foo");
            NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
            // Restoring the word does not restore the lemma — still unequal.
            foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
            NUnit.Framework.Assert.AreEqual(copy, foo);
            // Re-adding the lemma brings the label back to the snapshot state.
            // Hash code is consistent with equality across the same transitions.
            int hashCode = foo.GetHashCode();

            NUnit.Framework.Assert.AreEqual(copy.GetHashCode(), hashCode);
            foo.SetWord("bar");
            NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
            foo.SetWord("foo");
            NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
            // Hash code doesn't care between a value of null and the key not existing.
            NUnit.Framework.Assert.IsTrue(foo.Lemma() == null);
            int lemmalessHashCode = foo.GetHashCode();

            foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
            NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
            foo.SetLemma(null);
            NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
            foo.SetLemma("fool");
            NUnit.Framework.Assert.AreEqual(hashCode, foo.GetHashCode());
            // Equals likewise treats a null lemma value and a missing lemma key the same.
            foo.SetWord("bar");
            foo.SetWord("foo");
            ArrayCoreMap nulledCopy = new ArrayCoreMap(foo);

            NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
            foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
            NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
        }
        /// <summary>
        /// Lemmatizes <paramref name="arg"/> and writes the lemmas, space-separated and
        /// newline-terminated, to <paramref name="outStream"/> as UTF-8. The writer is
        /// flushed but deliberately not closed, so the caller's stream stays open.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public virtual void HandleLemma(string arg, OutputStream outStream)
        {
            if (arg == null)
            {
                return;
            }
            IList<CoreLabel> tokens = parser.Lemmatize(arg);
            OutputStreamWriter osw = new OutputStreamWriter(outStream, "utf-8");
            bool first = true;
            foreach (CoreLabel word in tokens)
            {
                // Separate lemmas with a single space (no leading space before the first).
                if (!first)
                {
                    osw.Write(" ");
                }
                osw.Write(word.Lemma());
                first = false;
            }
            osw.Write("\n");
            osw.Flush();
        }
 /// <summary>
 /// Worker entry point: applies every token-sequence pattern to every sentence in
 /// this worker's share of sentence ids and collects the candidate phrases extracted.
 /// </summary>
 /// <returns>
 /// A triple of (phrase-by-pattern frequency counts, matched token spans per pattern,
 /// phrases whose every matched token already carried the label).
 /// </returns>
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         // Phrases where no matched token was unlabeled (already fully labeled).
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         // Frequency of each candidate phrase per extracting pattern.
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         // (sentence id, start index, inclusive end index) spans matched by each pattern.
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     // Token span [s, e) of the "$term" capture group for this match.
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         // Grow the span leftwards over contiguous already-labeled tokens.
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         // Grow the span rightwards the same way.
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         // Record on the token itself that some pattern matched here.
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         // Discard the phrase if the token carries any ignore-class value.
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 // Token is kept: note whether it was not yet labeled, and
                                 // append it to the phrase being built.
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     // Reject phrases with an interior gap (a dropped stop word strictly
                     // between two kept tokens); dropping only at the edges is allowed.
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         // Log and rethrow (preserving the stack trace) so the executor sees the failure.
         logger.Error(e);
         throw;
     }
 }
 /// <summary>Returns true when the token has a non-null, non-empty lemma.</summary>
 private static bool LemmaExists(CoreLabel l)
 {
     string lemma = l.Lemma();
     return(lemma != null && !lemma.IsEmpty());
 }
// Exemplo n.º 9
// 0
 /*
  * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq,  TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
  * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
  * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
  * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
  * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
  * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");
  *
  * for(Entry<Integer, Double> en: patterns.entrySet()){
  * Integer pindex = en.getKey();
  * SurfacePattern p = constVars.getPatternIndex().get(pindex);
  * String[] n = p.getSimplerTokensNext();
  * String[] pr = p.getSimplerTokensPrev();
  * boolean rest = false;
  * if(n!=null){
  * for(String e: n){
  * if(!specialWords.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest == false && pr!=null){
  * for(String e: pr){
  * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest)
  * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
  * else
  * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
  * }
  *
  *
  *
  * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());
  *
  * if (constVars.batchProcessSents) {
  * List<File> filesToLoad;
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
  * filesToLoad = Data.sentsFiles;
  * else{
  * filesToLoad = new ArrayList<File>();
  * for (String fname : sentidswithfilerest.keySet()) {
  * String filename;
  * //          if(!constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname;
  * //          else
  * filename = fname;
  * filesToLoad.add(new File(filename));
  * }
  * }
  *
  * for (File fname : filesToLoad) {
  * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname);
  *
  * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){
  *
  * String filename;
  * //          if(constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname.getName();
  * //          else
  * filename = fname.getAbsolutePath();
  *
  * Set<String> sentIDs = sentidswithfilerest.get(filename);
  * if (sentIDs != null){
  * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * Redwood.log(Redwood.DBG, "No sentIds for " + filename  + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  *
  * if (computeDataFreq){
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
  * }
  * }
  *
  * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error.
  * if(computeDataFreq){
  * for(File f: Data.sentsFiles){
  * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(f.getName());
  * }
  * }
  * }
  *
  * } else {
  *
  * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
  * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
  * Set<String> sentids = sentidswithfilerest.get(filename);
  * if (sentids != null) {
  * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * throw new RuntimeException("How come no sentIds for " + filename  + ". Index keyset is " + constVars.invertedIndex.getKeySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
  * }
  * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  * }
  */
 /// <summary>
 /// Tallies word/lemma extraction counts for tokens whose previously stored
 /// patterns intersect the patterns learned this iteration, without re-running
 /// any matchers.
 /// </summary>
 private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted)
 {
     foreach (KeyValuePair <string, DataInstance> sentEntry in sents)
     {
         IDictionary <int, ICollection <E> > patternsPerToken = patternsForEachToken.GetPatternsForAllTokens(sentEntry.Key);
         if (patternsPerToken == null)
         {
             throw new Exception("How come there are no patterns for " + sentEntry.Key);
         }
         foreach (KeyValuePair <int, ICollection <E> > tokenEntry in patternsPerToken)
         {
             ICollection <E> patternsAtToken = tokenEntry.Value;
             CoreLabel cachedToken = null;
             foreach (E learned in patternsLearnedThisIter.KeySet())
             {
                 if (!patternsAtToken.Contains(learned))
                 {
                     continue;
                 }
                 // Look the token up lazily, at most once per position.
                 if (cachedToken == null)
                 {
                     cachedToken = sentEntry.Value.GetTokens()[tokenEntry.Key];
                 }
                 wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(cachedToken.Word(), cachedToken.Lemma()), learned);
             }
         }
     }
 }
        /// <summary>
        /// Rewrites the leaves of <paramref name="tree"/> in place: optionally replaces
        /// each leaf's surface form with its (PTB-escaped) lemma, and optionally embeds
        /// lemma and morphological analysis into the leaf string.
        /// </summary>
        /// <param name="tree">Tree whose yield consists of CoreLabels; modified in place.</param>
        /// <param name="lemmasAsLeaves">If true, use the lemma as the leaf's word/value.</param>
        /// <param name="addMorphoToLeaves">If true, append lemma and morph analysis to the leaf.</param>
        public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
        {
            IList<ILabel> labels = tree.Yield();
            foreach (ILabel label in labels)
            {
                ++nTokens;
                if (!(label is CoreLabel))
                {
                    throw new ArgumentException("Only works with CoreLabels trees");
                }
                CoreLabel coreLabel = (CoreLabel)label;
                string lemma = coreLabel.Lemma();
                // PTB-escape the lemma since we're going to put this in the leaf.
                if (lemma == null)
                {
                    // No lemma, so just use the surface form.
                    lemma = coreLabel.Word();
                }
                else if (lemma.Equals("("))
                {
                    lemma = "-LRB-";
                }
                else if (lemma.Equals(")"))
                {
                    lemma = "-RRB-";
                }
                if (lemmasAsLeaves)
                {
                    string escapedLemma = lemma;
                    coreLabel.SetWord(escapedLemma);
                    coreLabel.SetValue(escapedLemma);
                    coreLabel.SetLemma(lemma);
                }
                if (addMorphoToLeaves)
                {
                    string morphStr = coreLabel.OriginalText();
                    if (string.IsNullOrEmpty(morphStr))
                    {
                        morphStr = MorphoFeatureSpecification.NoAnalysis;
                    }
                    else
                    {
                        ++nMorphAnalyses;
                    }
                    // Normalize punctuation analyses
                    if (morphStr.StartsWith("PONCT"))
                    {
                        morphStr = "PUNC";
                    }
                    // BUGFIX: the Java-style "%s%s%s%s%s" specifiers are not interpreted by
                    // .NET's string.Format and would be emitted literally; use {0}-style
                    // placeholders so the leaf really contains value+lemma+morph.
                    string newLeaf = string.Format("{0}{1}{2}{3}{4}", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
                    coreLabel.SetValue(newLeaf);
                    coreLabel.SetWord(newLeaf);
                }
            }
        }
// Exemplo n.º 11
// 0
 /// <summary>Conjugates using the token's lemma when present, else its surface word.</summary>
 /// <seealso cref="ConjugateEnglish(string, bool)"/>
 public string ConjugateEnglish(CoreLabel token)
 {
     string baseForm = token.Lemma();
     return(ConjugateEnglish(baseForm ?? token.Word(), false));
 }
        //goldList null if not training
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //necessary for 'ScoreBestMention'
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        //quoteParagraphIdx - 1 for this and prev
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        //quoteParagraphIdx + 1
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        continue;
                    }
                    //disregard mention-in-quote cases.
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //extract begin and end tokens of
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                for (int i = sentenceIdx - 1; i_1 >= 0; i_1--)
                                {
                                    ICoreMap currSentence = sentences[i_1];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    if (isTraining)
                    {
                        if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                        else
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            dataset.Add(datum);
                            mapDatumToMention[dataset.Size()] = mention;
                        }
                    }
                    else
                    {
                        RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none");
                        datum.SetID(int.ToString(dataset.Size()));
                        mapDatumToMention[dataset.Size()] = mention;
                        dataset.Add(datum);
                    }
                }
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }
        /// <summary>
        /// Runs every dependency (Semgrex) pattern over every sentence assigned to this worker and
        /// collects the phrases those patterns extract.
        /// For each match it optionally widens the token span to absorb neighboring already-labeled
        /// tokens, marks each covered token with <c>MatchedPattern</c>/<c>MatchedPatterns</c>
        /// annotations, filters out stop words and tokens carrying ignored annotation classes, and
        /// discards any phrase whose kept tokens have an interior gap (a dropped token strictly
        /// between two kept ones).
        /// </summary>
        /// <returns>
        /// A pair of: (1) a counter of extracted <c>CandidatePhrase</c>s per pattern, and
        /// (2) a map from each pattern to the (sentence-id, start, end) token spans it matched.
        /// </returns>
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
            // CollectionValuedMap<String, Integer>();
            TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();

            foreach (string sentid in sentids)
            {
                DataInstance      sent   = sents[sentid];
                IList <CoreLabel> tokens = sent.GetTokens();
                foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns)
                {
                    if (pEn.Key == null)
                    {
                        // A null key indicates a corrupted pattern index; fail loudly rather than NPE later.
                        throw new InvalidOperationException("why is the pattern " + pEn + " null?");
                    }
                    SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
                    //SemgrexMatcher m = pEn.getKey().matcher(graph);
                    //TokenSequenceMatcher m = pEn.getKey().matcher(sent);
                    //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                    //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                    //Higher branch values makes the faster but uses more memory
                    //m.setBranchLimit(5);
                    ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
                    foreach (ExtractedPhrase match in matched)
                    {
                        // [s, e) is the half-open token span of this match within the sentence.
                        int    s                 = match.startIndex;
                        int    e                 = match.endIndex + 1;
                        string phrase            = string.Empty;
                        string phraseLemma       = string.Empty;
                        bool   useWordNotLabeled = false;  // true once the span contains at least one not-yet-labeled token
                        bool   doNotUse          = false;  // set when the span must be discarded entirely
                        //find if the neighboring words are labeled - if so - club them together
                        if (constVars.clubNeighboringLabeledWords)
                        {
                            // Grow the span leftward while the preceding token already carries this label
                            // (bounded by numWordsCompoundMapped — presumably a max compound length; confirm).
                            for (int i = s - 1; i >= 0; i--)
                            {
                                if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    s = i;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s);
                                    break;
                                }
                            }
                            // Grow the span rightward under the same conditions.
                            for (int i_1 = e; i_1 < tokens.Count; i_1++)
                            {
                                if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label])
                                {
                                    e = i_1;
                                }
                                else
                                {
                                    //System.out.println("for phrase " + match + " clubbing next word. new e is " + e);
                                    break;
                                }
                            }
                        }
                        //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                        bool[] addedindices = new bool[e - s];
                        // Arrays.fill(addedindices, false); // get for free on array initialization
                        for (int i_2 = s; i_2 < e; i_2++)
                        {
                            CoreLabel l = tokens[i_2];
                            // Record on the token itself that some pattern matched it.
                            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                            {
                                l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                            }
                            Pattern pSur = pEn.Value;
                            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                            // Discard the whole phrase if this token carries any annotation value
                            // configured to be ignored during selection.
                            foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                            {
                                if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                                {
                                    doNotUse = true;
                                }
                            }
                            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                            if (removePhrasesWithStopWords && containsStop)
                            {
                                doNotUse = true;
                            }
                            else
                            {
                                // Keep the token unless it is a stop word that should be stripped from phrases.
                                if (!containsStop || !removeStopWordsFromSelectedPhrases)
                                {
                                    if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                    {
                                        useWordNotLabeled = true;
                                    }
                                    phrase               += " " + l.Word();
                                    phraseLemma          += " " + l.Lemma();
                                    addedindices[i_2 - s] = true;
                                }
                            }
                        }
                        // Reject phrases with an interior gap: a dropped token strictly between two kept
                        // tokens means a stop word sat in the middle of the phrase, not at its edge.
                        for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                        {
                            if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                            {
                                doNotUse = true;
                                break;
                            }
                        }
                        if (!doNotUse && useWordNotLabeled)
                        {
                            matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                            // (The original re-checked useWordNotLabeled here; it is already guaranteed
                            // true by the guard above, so the redundant check was removed.)
                            phrase      = phrase.Trim();
                            phraseLemma = phraseLemma.Trim();
                            allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
                        }
                    }
                }
            }
            return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }
        /// <exception cref="System.Exception"/>
        public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call()
        {
            //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>();
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
            TwoDimensionalCounter <Pair <string, string>, E>    allFreq            = new TwoDimensionalCounter <Pair <string, string>, E>();

            foreach (string sentid in sentids)
            {
                IList <CoreLabel> sent = sents[sentid].GetTokens();
                //FIND_ALL is faster than FIND_NONOVERLAP
                IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
                foreach (ISequenceMatchResult <ICoreMap> m in matched)
                {
                    int s          = m.Start("$term");
                    int e          = m.End("$term");
                    E   matchedPat = patterns[m.Pattern()];
                    matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e));
                    string phrase            = string.Empty;
                    string phraseLemma       = string.Empty;
                    bool   useWordNotLabeled = false;
                    bool   doNotUse          = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false); // unneeded as done on initialization
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)))
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                        }
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
                        // if (restrictToMatched) {
                        // tokensMatchedPattern.add(sentid, i);
                        // }
                        foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString()))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase               += " " + l.Word();
                                phraseLemma          += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse && useWordNotLabeled)
                    {
                        phrase      = phrase.Trim();
                        phraseLemma = phraseLemma.Trim();
                        allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0);
                    }
                }
            }
            //      for (SurfacePattern pat : patterns.keySet()) {
            //        String patternStr = pat.toString();
            //
            //        TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
            //        if (pat == null || p == null)
            //          throw new RuntimeException("why is the pattern " + pat + " null?");
            //
            //        TokenSequenceMatcher m = p.getMatcher(sent);
            //        while (m.find()) {
            //
            //          int s = m.start("$term");
            //          int e = m.end("$term");
            //
            //          String phrase = "";
            //          String phraseLemma = "";
            //          boolean useWordNotLabeled = false;
            //          boolean doNotUse = false;
            //          for (int i = s; i < e; i++) {
            //            CoreLabel l = sent.get(i);
            //            l.set(PatternsAnnotations.MatchedPattern.class, true);
            //            if (restrictToMatched) {
            //              tokensMatchedPattern.add(sentid, i);
            //            }
            //            for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) {
            //              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
            //                doNotUse = true;
            //              }
            //            }
            //            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords);
            //            if (removePhrasesWithStopWords && containsStop) {
            //              doNotUse = true;
            //            } else {
            //              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            //
            //                if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) {
            //                  useWordNotLabeled = true;
            //                }
            //                phrase += " " + l.word();
            //                phraseLemma += " " + l.lemma();
            //
            //              }
            //            }
            //          }
            //          if (!doNotUse && useWordNotLabeled) {
            //            phrase = phrase.trim();
            //            phraseLemma = phraseLemma.trim();
            //            allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0);
            //          }
            //        }
            //      }
            return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat));
        }
// (scraper artifact, preserved as comment) Exemplo n.º 15
// 0
        /// <summary>
        /// Builds candidate surface patterns for the token at index <paramref name="i"/> of
        /// <paramref name="sent"/>, from context windows of every size 1..maxWindow4Pattern on
        /// each side. For each window size it may emit a previous-context pattern, a
        /// next-context pattern, and a combined prev+next pattern (each optionally in a
        /// with-POS and a without-POS variant, per the usePOS4Pattern / addPatWithoutPOS flags).
        /// </summary>
        /// <param name="sent">Tokenized sentence containing the target token.</param>
        /// <param name="i">Index of the target token within <paramref name="sent"/>.</param>
        /// <param name="stopWords">Stop words consulted (via DoNotUse) when counting how informative a context window is.</param>
        /// <returns>The set union of the previous-, next- and prev+next-context patterns generated.</returns>
        public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords)
        {
            // Accumulators for the three pattern genres (previous-only, next-only, both sides).
            ICollection <SurfacePattern> prevpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> nextpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>();
            CoreLabel token = sent[i];
            string    tag   = null;

            if (usePOS4Pattern)
            {
                string fulltag = token.Tag();
                if (useCoarsePOS)
                {
                    // Coarse POS = first (at most) two characters of the full tag, e.g. "NN" from "NNS".
                    tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
                }
                else
                {
                    tag = fulltag;
                }
            }
            string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));

            // One pass per window size; patterns from all sizes are pooled into the same sets.
            for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
            {
                IList <Token>  previousTokens   = new List <Token>();
                IList <string> originalPrev     = new List <string>();
                IList <string> originalNext     = new List <string>();
                IList <Token>  nextTokens       = new List <Token>();
                int            numStopWordsprev = 0;
                int            numStopWordsnext = 0;
                // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
                int          numNonStopWordsNext = 0;
                int          numNonStopWordsPrev = 0;
                bool         useprev             = false;
                bool         usenext             = false;
                // Templates describing the target term slot; "PatternToken" seems to carry the
                // POS/NER/parse restrictions for the term being extracted.
                PatternToken twithoutPOS         = null;
                //TODO: right now using numWordsCompoundMax.
                if (addPatWithoutPOS)
                {
                    twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                PatternToken twithPOS = null;
                if (usePOS4Pattern)
                {
                    twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                if (usePreviousContext)
                {
                    // Walk leftwards from i-1, collecting up to maxWin non-filler context tokens.
                    // int j = Math.max(0, i - 1);
                    int j         = i - 1;
                    int numTokens = 0;
                    while (numTokens < maxWin && j >= 0)
                    {
                        // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            // Filler words are skipped entirely: they neither fill a window slot
                            // nor advance numTokens.
                            j--;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException("how come the class "
                        //                + answerClass.get(label) + " for token "
                        //                + tokenj.word() + " in " + sent + " is not set");
                        //          }
                        // tr: (is the token labeled background "O"?, generic pattern token, original string).
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        if (!isLabeledO)
                        {
                            // Labeled (non-background) token: always counts as informative context.
                            // Inserted at index 0 so previousTokens ends up in left-to-right order.
                            // numPrevTokensSpecial++;
                            previousTokens.Add(0, strgeneric);
                            // previousTokens.add(0,
                            // "[{answer:"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalPrev.Add(0, strOriginal);
                            numNonStopWordsPrev++;
                        }
                        else
                        {
                            if (tokenj.Word().StartsWith("http"))
                            {
                                // Hitting a URL token abandons the whole previous context for this window.
                                useprev = false;
                                previousTokens.Clear();
                                originalPrev.Clear();
                                break;
                            }
                            else
                            {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                previousTokens.Add(0, str);
                                originalPrev.Add(0, tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsprev++;
                                }
                                else
                                {
                                    numNonStopWordsPrev++;
                                }
                            }
                        }
                        numTokens++;
                        j--;
                    }
                }
                if (useNextContext)
                {
                    // Mirror of the previous-context walk, moving rightwards from i+1.
                    int numTokens = 0;
                    int j         = i + 1;
                    while (numTokens < maxWin && j < sent.Count)
                    {
                        // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j++;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException(
                        //                "how come the dict annotation for token " + tokenj.word()
                        //                    + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        // boolean isLabeledO = tokenj.get(answerClass.get(label))
                        // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                        if (!isLabeledO)
                        {
                            // numNextTokensSpecial++;
                            numNonStopWordsNext++;
                            nextTokens.Add(strgeneric);
                            // nextTokens.add("[{" + label + ":"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalNext.Add(strOriginal);
                        }
                        else
                        {
                            // originalNextStr += " "
                            // + tokenj.get(answerClass.get(label)).toString();
                            if (tokenj.Word().StartsWith("http"))
                            {
                                // URL token abandons the next context for this window.
                                usenext = false;
                                nextTokens.Clear();
                                originalNext.Clear();
                                break;
                            }
                            else
                            {
                                // if (!tokenj.word().matches("[.,?()]")) {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                nextTokens.Add(str);
                                originalNext.Add(tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsnext++;
                                }
                                else
                                {
                                    numNonStopWordsNext++;
                                }
                            }
                        }
                        j++;
                        numTokens++;
                    }
                }
                // String prevContext = null, nextContext = null;
                // int numNonSpecialPrevTokens = previousTokens.size()
                // - numPrevTokensSpecial;
                // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
                Token[] prevContext = null;
                //String[] prevContext = null;
                //String[] prevOriginalArr = null;
                // if (previousTokens.size() >= minWindow4Pattern
                // && (numStopWordsprev < numNonSpecialPrevTokens ||
                // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
                // Keep a previous-context pattern only when the window is big enough and either
                // has at least one non-stop word or enough stop words to clear the threshold.
                if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
                {
                    // prevContext = StringUtils.join(previousTokens, fw);
                    // Interleave the filler-word token (fw) between context tokens, and append
                    // the stop-word token (sw) next to the target slot.
                    IList <Token>  prevContextList = new List <Token>();
                    IList <string> prevOriginal    = new List <string>();
                    foreach (Token p in previousTokens)
                    {
                        prevContextList.Add(p);
                        if (!fw.IsEmpty())
                        {
                            prevContextList.Add(fw);
                        }
                    }
                    // add fw and sw to the the originalprev
                    foreach (string p_1 in originalPrev)
                    {
                        prevOriginal.Add(p_1);
                        if (!fw.IsEmpty())
                        {
                            prevOriginal.Add(" FW ");
                        }
                    }
                    if (!sw.IsEmpty())
                    {
                        prevContextList.Add(sw);
                        prevOriginal.Add(" SW ");
                    }
                    // String str = prevContext + fw + sw;
                    // Non-ASCII contexts are discarded (and leave useprev false for this window).
                    if (IsASCII(StringUtils.Join(prevOriginal)))
                    {
                        prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                        //prevOriginalArr = prevOriginal.toArray(new String[0]);
                        if (previousTokens.Count >= minWindow4Pattern)
                        {
                            if (twithoutPOS != null)
                            {
                                SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(pat);
                            }
                            if (twithPOS != null)
                            {
                                SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(patPOS);
                            }
                        }
                        useprev = true;
                    }
                }
                Token[] nextContext = null;
                //String [] nextOriginalArr = null;
                // if (nextTokens.size() > 0
                // && (numStopWordsnext < numNonSpecialNextTokens ||
                // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
                if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
                {
                    // nextContext = StringUtils.join(nextTokens, fw);
                    // For the next side, sw precedes the context and fw precedes each token
                    // (mirror image of the previous-side assembly above).
                    IList <Token>  nextContextList = new List <Token>();
                    IList <string> nextOriginal    = new List <string>();
                    if (!sw.IsEmpty())
                    {
                        nextContextList.Add(sw);
                        nextOriginal.Add(" SW ");
                    }
                    foreach (Token n in nextTokens)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextContextList.Add(fw);
                        }
                        nextContextList.Add(n);
                    }
                    foreach (string n_1 in originalNext)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextOriginal.Add(" FW ");
                        }
                        nextOriginal.Add(n_1);
                    }
                    if (nextTokens.Count >= minWindow4Pattern)
                    {
                        nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                        //nextOriginalArr =  nextOriginal.toArray(new String[0]);
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(patPOS);
                        }
                    }
                    // NOTE(review): usenext is set even when nextTokens.Count < minWindow4Pattern
                    // and without the IsASCII check, unlike useprev above — confirm this asymmetry
                    // is intentional (it matches the upstream Java code).
                    usenext = true;
                }
                if (useprev && usenext)
                {
                    // Both sides survived: also emit a combined prev+next pattern when the total
                    // window meets the minimum size.
                    // String strprev = prevContext + fw + sw;
                    // String strnext = sw + fw + nextContext;
                    if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
                    {
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(patPOS);
                        }
                    }
                }
            }
            //    Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
            //        prevpatterns, nextpatterns, prevnextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prev patterns are " + prevpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " next patterns are " + nextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prevnext patterns are " + prevnextpatterns);
            //getPatternIndex().finishCommit();
            return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
        }