// TODO: Check that this does what we want for Datum other than RVFDatum
 /// <summary>
 /// Appends one datum to this dataset: its label is recorded, its features are stored as a counter,
 /// and the dataset size is incremented.
 /// </summary>
 /// <param name="d">
 /// The datum to add. When it is an <c>RVFDatum</c> over the same label/feature types its own feature
 /// counter is stored; otherwise a counter is built from <c>d.AsFeatures()</c>.
 /// </param>
 public override void Add(IDatum <L, F> d)
 {
     // If you edit me, also take care of WeightedRVFDataset
     AddLabel(d.Label());

     // The previous test was `d is RVFDatum<object, object>`. C# generic classes are invariant, so that
     // test fails for every other L/F and real-valued data fell through to the generic branch.
     // Testing against the closed type RVFDatum<L, F> selects the intended branch.
     RVFDatum <L, F> rvfDatum = d as RVFDatum <L, F>;
     if (rvfDatum != null)
     {
         AddFeatures(rvfDatum.AsFeaturesCounter());
     }
     else
     {
         AddFeatures(Counters.AsCounter(d.AsFeatures()));
     }
     size++;
 }
 // If you edit me, also take care of WeightedRVFDataset
 /// <summary>
 /// Appends one datum to this dataset together with its source and id: the label is recorded, the
 /// features are stored as a counter, the source/id pair is recorded, and the size is incremented.
 /// </summary>
 /// <param name="d">
 /// The datum to add. When it is an <c>RVFDatum</c> over the same label/feature types its own feature
 /// counter is stored; otherwise a counter is built from <c>d.AsFeatures()</c>.
 /// </param>
 /// <param name="src">Source value passed through to <c>AddSourceAndId</c>.</param>
 /// <param name="id">Id value passed through to <c>AddSourceAndId</c>.</param>
 public virtual void Add(IDatum <L, F> d, string src, string id)
 {
     AddLabel(d.Label());

     // The previous test was `d is RVFDatum<object, object>`. C# generic classes are invariant, so that
     // test fails for every other L/F and real-valued data fell through to the generic branch.
     // Testing against the closed type RVFDatum<L, F> selects the intended branch.
     RVFDatum <L, F> rvfDatum = d as RVFDatum <L, F>;
     if (rvfDatum != null)
     {
         AddFeatures(rvfDatum.AsFeaturesCounter());
     }
     else
     {
         AddFeatures(Counters.AsCounter(d.AsFeatures()));
     }
     AddSourceAndId(src, id);
     size++;
 }
Beispiel #3
0
        /// <summary>
        /// Gathers pattern/phrase statistics for this iteration and returns the phrases selected for
        /// <paramref name="label"/> according to <c>constVars.wordScoring</c>.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// (1) fill <paramref name="wordsPatExtracted"/>, either by counting without applying patterns
        /// (when <c>constVars.doNotApplyPatterns</c> is set) or by applying <paramref name="patternsLearnedThisIter"/>;
        /// (2) when <paramref name="computeProcDataFreq"/> is set, rebuild <c>Data.processedDataFreq</c> from <c>Data.rawFreq</c>;
        /// (3) in <c>Weightednorm</c> mode, score the candidate phrases, choose the top words, log them, and
        /// optionally write a <c>words.json</c> justification file; in <c>Bpb</c> mode, return the single phrase
        /// whose best pattern has the highest weight.
        /// </remarks>
        /// <param name="label">Label being learned; also used to look up seed/learned words and to build the output path.</param>
        /// <param name="patternsForEachToken">Passed to <c>StatsWithoutApplyingPatterns</c> when patterns are not applied.</param>
        /// <param name="patternsLearnedThisIter">Patterns applied in this iteration; in <c>Bpb</c> mode their counts are used as pattern weights.</param>
        /// <param name="allSelectedPatterns">Passed through to <c>phraseScorer.ScorePhrases</c>.</param>
        /// <param name="alreadyIdentifiedWords">Passed to the phrase scorer; in <c>Bpb</c> mode these phrases are removed from the candidates.</param>
        /// <param name="matchedTokensByPat">Passed through to <c>ApplyPats</c>.</param>
        /// <param name="scoreForAllWordsThisIteration">Cleared and refilled with the phrase scores (<c>Weightednorm</c> mode only).</param>
        /// <param name="terms">Filled by this method with the candidate phrases and the patterns that extracted them.</param>
        /// <param name="wordsPatExtracted">Phrase-to-pattern counts; populated in step (1) and read afterwards.</param>
        /// <param name="patternsAndWords4Label">Pattern-to-phrase counts; read only when building the justification file.</param>
        /// <param name="identifier">Path component placed between <c>constVars.outDir</c> and the label.</param>
        /// <param name="ignoreWords">Extra phrases to ignore when choosing top words; may be null or empty.</param>
        /// <param name="computeProcDataFreq">Whether to rebuild <c>Data.processedDataFreq</c>.</param>
        /// <returns>The selected phrases; an empty counter in <c>Bpb</c> mode when there is no candidate.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.Exception">Thrown when the frequency normalization or the word-scoring mode is not one of the handled values.</exception>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(
            string label,
            PatternsForEachToken patternsForEachToken,
            ICounter <E> patternsLearnedThisIter,
            ICounter <E> allSelectedPatterns,
            ICollection <CandidatePhrase> alreadyIdentifiedWords,
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat,
            ICounter <CandidatePhrase> scoreForAllWordsThisIteration,
            TwoDimensionalCounter <CandidatePhrase, E> terms,
            TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted,
            TwoDimensionalCounter <E, CandidatePhrase> patternsAndWords4Label,
            string identifier,
            ICollection <CandidatePhrase> ignoreWords,
            bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        // Report the raw frequency (fq.Value); @in already holds the normalized value at this point.
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + fq.Value);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    // No normalization requested: the processed frequencies are the raw counter itself (same object).
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                // Candidates are the extracted phrases that do not belong to another semantic class
                // (directly or through their lemma) and were not already labeled while applying patterns.
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));

                // Words to skip when choosing: caller-supplied ones, other classes' words, seeds, and words already learned.
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        // NOTE(review): the converted code called ForEach(null) here - the per-word callback was
                        // lost in the Java-to-C# conversion, so `s` was never populated. The loop below
                        // reconstructs the apparent intent (one "phrase:goldValue" line per selected word);
                        // confirm the exact text format against the upstream Java source.
                        foreach (CandidatePhrase selected in finalwords.KeySet())
                        {
                            string phraseText = selected.GetPhrase();
                            bool isGold;
                            s.Append(phraseText);
                            s.Append(goldEntities4Label.TryGetValue(phraseText, out isGold) ? ":" + isGold : ":UNKNOWN");
                            s.Append("\n");
                        }
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    // For every selected word, count the other phrases matched by the patterns that extracted it.
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        // A file from an earlier iteration exists: carry its entries over before appending.
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        try
                        {
                            IJsonArray objarr = jsonReader.ReadArray();
                            foreach (IJsonValue o in objarr)
                            {
                                obj.Add(o);
                            }
                        }
                        finally
                        {
                            // Close the reader even when reading the existing file fails.
                            jsonReader.Close();
                        }
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return finalwords;
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    // For each phrase, remember the weight of its strongest pattern and which pattern that was.
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    // Phrases tied (within 1e-10) for the highest pattern weight.
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        // Break ties by how often the phrase was extracted by its strongest pattern.
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // IEnumerator.Current is undefined before the first MoveNext(), so the previous
                            // `words.GetEnumerator().Current` did not yield the element; take it via foreach.
                            foreach (CandidatePhrase only in words)
                            {
                                bestw = only;
                                break;
                            }
                        }
                        else
                        {
                            return new ClassicCounter <CandidatePhrase>();
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return Counters.AsCounter(Arrays.AsList(bestw));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
Beispiel #4
0
 /// <summary>
 /// Builds an <c>RVFDatum</c> with the given label whose feature counter is created from
 /// <paramref name="items"/>.
 /// </summary>
 /// <param name="label">Label assigned to the new datum.</param>
 /// <param name="items">Features to place in the datum's counter.</param>
 /// <returns>The newly constructed datum.</returns>
 private static RVFDatum <L, F> NewRVFDatum <L, F>(L label, params F[] items)
 {
     var featureList   = Arrays.AsList(items);
     var featureCounts = Counters.AsCounter(featureList);
     return new RVFDatum <L, F>(featureCounts, label);
 }
 //Here, the index (startIndex, endIndex) seems to be inclusive of the endIndex
 /// <summary>
 /// Extracts the phrase covered by the descendants of <paramref name="w"/> in <paramref name="g"/>,
 /// records it in the output collections, and optionally recurses into the descendants.
 /// </summary>
 /// <remarks>
 /// Nodes reachable through "conj_and" are processed as separate sub-graphs first and then added to
 /// <paramref name="doNotAddThese"/>. Any exception raised during processing is caught and its stack
 /// trace printed; it is not rethrown.
 /// </remarks>
 /// <param name="g">Graph being traversed.</param>
 /// <param name="w">Node whose sub-graph is turned into a phrase.</param>
 /// <param name="additionalCutOffRels">Extra relation names added to <c>cutoffRelations</c>; may be null.</param>
 /// <param name="textTokens">Sentence tokens from which the phrase text is sliced.</param>
 /// <param name="listOfOutput">Phrases emitted so far; the new phrase is appended when not already present.</param>
 /// <param name="listOfOutputIndices">Index pairs emitted so far; appended to when the new pair is not present.</param>
 /// <param name="seenNodes">Nodes already visited; <paramref name="w"/> is appended on first visit.</param>
 /// <param name="doNotAddThese">Nodes to exclude; grows with the "conj_and" nodes found here.</param>
 /// <param name="findSubTrees">When true, recurse into every descendant not yet seen.</param>
 /// <param name="extractedPhrases">Receives an <c>ExtractedPhrase</c> for each newly recorded index pair.</param>
 /// <param name="pattern">Pattern stored on the created <c>ExtractedPhrase</c>.</param>
 /// <param name="acceptWord">Predicate passed through to <c>Descendants</c>.</param>
 public virtual void PrintSubGraph(
     SemanticGraph g,
     IndexedWord w,
     IList <string> additionalCutOffRels,
     IList <string> textTokens,
     ICollection <string> listOfOutput,
     ICollection <IntPair> listOfOutputIndices,
     IList <IndexedWord> seenNodes,
     IList <IndexedWord> doNotAddThese,
     bool findSubTrees,
     ICollection <ExtractedPhrase> extractedPhrases,
     SemgrexPattern pattern,
     IPredicate <CoreLabel> acceptWord)
 {
     try
     {
         // Each node is handled once; an excluded node is still marked as seen.
         if (seenNodes.Contains(w))
         {
             return;
         }
         seenNodes.Add(w);
         if (doNotAddThese.Contains(w))
         {
             return;
         }

         // Conjuncts become phrases of their own, then are excluded from this node's phrase.
         IList <IndexedWord> conjuncts = new List <IndexedWord>();
         DescendantsWithReln(g, w, "conj_and", new List <IndexedWord>(), conjuncts);
         foreach (IndexedWord conjunct in conjuncts)
         {
             PrintSubGraph(g, conjunct, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
         }
         Sharpen.Collections.AddAll(doNotAddThese, conjuncts);

         // Caller-supplied cut-off relations come first, followed by the instance-wide ones.
         IList <string> cutOffRels = new List <string>();
         if (additionalCutOffRels != null)
         {
             Sharpen.Collections.AddAll(cutOffRels, additionalCutOffRels);
         }
         Sharpen.Collections.AddAll(cutOffRels, cutoffRelations);

         CollectionValuedMap <int, string> tokenFeatures = new CollectionValuedMap <int, string>();
         ICollection <string> phraseFeatures = new List <string>();
         GetPatternsFromDataMultiClass.GetFeatures(g, w, true, phraseFeatures, null);
         ICollection <IndexedWord> members = Descendants(g, w, cutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, tokenFeatures);
         if (members.Count <= 0)
         {
             return;
         }

         // Smallest and largest word index among the descendants.
         int first = int.MaxValue;
         int last  = -1;
         foreach (IndexedWord member in members)
         {
             first = Math.Min(first, member.Index());
             last  = Math.Max(last, member.Index());
         }

         // Clip the span so that it covers at most maxPhraseLength tokens.
         if ((last - first + 1) > maxPhraseLength)
         {
             last = first + maxPhraseLength - 1;
         }

         // Word indices are shifted down by one to address textTokens; the stored pair keeps both ends shifted.
         int startOffset = first - 1;
         int endOffset   = last - 1;
         IntPair span    = new IntPair(startOffset, endOffset);
         string phrase   = StringUtils.Join(textTokens.SubList(startOffset, last), " ").Trim();

         phraseFeatures.Add("LENGTH-" + (last - first + 1));
         for (int tokenIndex = first; tokenIndex <= last; ++tokenIndex)
         {
             Sharpen.Collections.AddAll(phraseFeatures, tokenFeatures[tokenIndex]);
         }

         ExtractedPhrase candidate = new ExtractedPhrase(startOffset, endOffset, pattern, phrase, Counters.AsCounter(phraseFeatures));

         // NOTE(review): doNotAddThese holds IndexedWord items while phrase is a string, so the second
         // test looks like it can never match - confirm the intended check.
         if (listOfOutput.Contains(phrase) || doNotAddThese.Contains(phrase))
         {
             return;
         }

         listOfOutput.Add(phrase);
         if (!listOfOutputIndices.Contains(span))
         {
             listOfOutputIndices.Add(span);
             extractedPhrases.Add(candidate);
         }

         if (!findSubTrees)
         {
             return;
         }
         foreach (IndexedWord child in members)
         {
             if (!seenNodes.Contains(child))
             {
                 PrintSubGraph(g, child, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
             }
         }
     }
     catch (Exception e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }