// line is a JSON string that maps each label to an array of strings, e.g. {"name":["Bush","Carter","Obama"]}
        /// <summary>
        /// Parses <paramref name="line"/> as a JSON object that maps each label to an array of phrase
        /// strings (for example <c>{"name":["Bush","Carter","Obama"]}</c>), adds those phrases to the
        /// seed words of the label, and hands them to
        /// <c>GetPatternsFromDataMultiClass.RunLabelSeedWords</c> together with <c>Data.sents</c>.
        /// </summary>
        /// <param name="line">JSON object text; every value is read as a JSON array of strings.</param>
        /// <returns>The fixed status string <c>"SUCCESS added new phrases"</c>.</returns>
        /// <exception cref="System.Exception"/>
        public virtual string DoNewPhrases(string line)
        {
            System.Console.Out.WriteLine("adding new phrases");
            ConstantsAndVariables constVars = new ConstantsAndVariables(props, humanLabelClasses.Keys, humanLabelClasses);

            // Read the whole object up front and always release the reader, even if parsing fails.
            IJsonObject objarr;
            IJsonReader jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
            try
            {
                objarr = jsonReader.ReadObject();
            }
            finally
            {
                jsonReader.Close();
            }

            foreach (KeyValuePair <string, IJsonValue> o in objarr)
            {
                string label = o.Key;
                ICollection <CandidatePhrase> seed = new HashSet <CandidatePhrase>();
                IJsonArray arr = objarr.GetJsonArray(o.Key);
                for (int i = 0; i < arr.Count; i++)
                {
                    string seedw = arr.GetString(i);
                    System.Console.Out.WriteLine("adding " + seedw + " to seed ");
                    seed.Add(CandidatePhrase.CreateOrGet(seedw));
                }
                // The new phrases are registered in three places: this object's seedWords,
                // the freshly built constVars, and the sentence labelling run below.
                Sharpen.Collections.AddAll(seedWords[label], seed);
                constVars.AddSeedWords(label, seed);
                GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seed, constVars, false);
            }
            //model.labelWords(label, labelclass, Data.sents, seed);
            return("SUCCESS added new phrases");
        }
示例#2
0
 /*
  * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq,  TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
  * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
  * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
  * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
  * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
  * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");
  *
  * for(Entry<Integer, Double> en: patterns.entrySet()){
  * Integer pindex = en.getKey();
  * SurfacePattern p = constVars.getPatternIndex().get(pindex);
  * String[] n = p.getSimplerTokensNext();
  * String[] pr = p.getSimplerTokensPrev();
  * boolean rest = false;
  * if(n!=null){
  * for(String e: n){
  * if(!specialWords.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest == false && pr!=null){
  * for(String e: pr){
  * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest)
  * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
  * else
  * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
  * }
  *
  *
  *
  * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());
  *
  * if (constVars.batchProcessSents) {
  * List<File> filesToLoad;
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
  * filesToLoad = Data.sentsFiles;
  * else{
  * filesToLoad = new ArrayList<File>();
  * for (String fname : sentidswithfilerest.keySet()) {
  * String filename;
  * //          if(!constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname;
  * //          else
  * filename = fname;
  * filesToLoad.add(new File(filename));
  * }
  * }
  *
  * for (File fname : filesToLoad) {
  * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname);
  *
  * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){
  *
  * String filename;
  * //          if(constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname.getName();
  * //          else
  * filename = fname.getAbsolutePath();
  *
  * Set<String> sentIDs = sentidswithfilerest.get(filename);
  * if (sentIDs != null){
  * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * Redwood.log(Redwood.DBG, "No sentIds for " + filename  + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  *
  * if (computeDataFreq){
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
  * }
  * }
  *
  * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error.
  * if(computeDataFreq){
  * for(File f: Data.sentsFiles){
  * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(f.getName());
  * }
  * }
  * }
  *
  * } else {
  *
  * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
  * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
  * Set<String> sentids = sentidswithfilerest.get(filename);
  * if (sentids != null) {
  * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * throw new RuntimeException("How come no sentIds for " + filename  + ". Index keyset is " + constVars.invertedIndex.getKeySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
  * }
  * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  * }
  */
 /// <summary>
 /// Gathers word/pattern statistics without applying the patterns: for every token of every
 /// sentence, each learned pattern that is listed among the token's patterns increments the
 /// count of (token word/lemma phrase, pattern) in <paramref name="wordsandLemmaPatExtracted"/>.
 /// </summary>
 /// <param name="sents">Sentences keyed by sentence id.</param>
 /// <param name="patternsForEachToken">Source of the per-token pattern sets for a sentence id.</param>
 /// <param name="patternsLearnedThisIter">Patterns whose keys are tested against each token.</param>
 /// <param name="wordsandLemmaPatExtracted">Counter that receives the increments.</param>
 /// <exception cref="System.Exception">No pattern map exists for one of the sentence ids.</exception>
 private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted)
 {
     foreach (KeyValuePair <string, DataInstance> sentence in sents)
     {
         IDictionary <int, ICollection <E> > patternsByToken = patternsForEachToken.GetPatternsForAllTokens(sentence.Key);
         if (patternsByToken == null)
         {
             throw new Exception("How come there are no patterns for " + sentence.Key);
         }
         foreach (KeyValuePair <int, ICollection <E> > tokenEntry in patternsByToken)
         {
             ICollection <E> tokenPatterns = tokenEntry.Value;
             // Resolved lazily: the token is only fetched once some learned pattern matches it.
             CoreLabel matchedToken = null;
             foreach (E learned in patternsLearnedThisIter.KeySet())
             {
                 if (!tokenPatterns.Contains(learned))
                 {
                     continue;
                 }
                 if (matchedToken == null)
                 {
                     matchedToken = sentence.Value.GetTokens()[tokenEntry.Key];
                 }
                 CandidatePhrase phrase = CandidatePhrase.CreateOrGet(matchedToken.Word(), matchedToken.Lemma());
                 wordsandLemmaPatExtracted.IncrementCount(phrase, learned);
             }
         }
     }
 }
示例#3
0
 /// <summary>
 /// Adds to <c>Data.rawFreq</c> one count for every n-gram (1 to <paramref name="numWordsCompound"/>
 /// consecutive tokens) of every sentence, keyed by the space-joined token words, and then
 /// refreshes the Google/domain n-gram ratios when those sources are in use.
 /// </summary>
 /// <param name="sents">Sentences keyed by sentence id; only the values are read.</param>
 /// <param name="numWordsCompound">Maximum n-gram length passed to <c>CollectionUtils.GetNGrams</c>.</param>
 public static void ComputeRawFreqIfNull(IDictionary <string, DataInstance> sents, int numWordsCompound)
 {
     Redwood.Log(Redwood.Dbg, "Computing raw freq for every 1-" + numWordsCompound + " consecutive words");
     foreach (DataInstance instance in sents.Values)
     {
         foreach (IList <CoreLabel> ngram in CollectionUtils.GetNGrams(instance.GetTokens(), 1, numWordsCompound))
         {
             string phrase = string.Empty;
             foreach (CoreLabel tok in ngram)
             {
                 phrase = phrase + " " + tok.Word();
             }
             // Trim drops the leading separator added by the loop above.
             phrase = phrase.Trim();
             if (phrase.IsEmpty())
             {
                 continue;
             }
             Data.rawFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase));
         }
     }
     if (usingGoogleNgram)
     {
         SetRatioGoogleNgramFreqWithDataFreq();
     }
     bool hasDomainNgrams = domainNGramRawFreq != null && domainNGramRawFreq.Size() > 0;
     if (hasDomainNgrams)
     {
         ratioDomainNgramFreqWithDataFreq = domainNGramRawFreq.TotalCount() / Data.rawFreq.TotalCount();
     }
 }
 /// <summary>
 /// Scores <paramref name="word"/> as the summed weight of the patterns that extracted it divided
 /// by its count in <c>Data.processedDataFreq</c>.
 /// </summary>
 /// <param name="word">Phrase being scored.</param>
 /// <param name="patsThatExtractedThis">
 /// Patterns that extracted the phrase. Modified in place: when pattern weights are in use, every
 /// pattern whose weight in <paramref name="allSelectedPatterns"/> is zero is removed from it.
 /// </param>
 /// <param name="allSelectedPatterns">Pattern weights, consulted only when <c>usePatternWeights</c> is set.</param>
 /// <returns>0 when the processed frequency of the phrase is zero; otherwise the ratio described above.</returns>
 internal virtual double GetPatTFIDFScore(CandidatePhrase word, ICounter <E> patsThatExtractedThis, ICounter <E> allSelectedPatterns)
 {
     if (Data.processedDataFreq.GetCount(word) == 0.0)
     {
         Redwood.Log(Redwood.Warn, "How come the processed corpus freq has count of " + word + " 0. The count in raw freq is " + Data.rawFreq.GetCount(word) + " and the Data.rawFreq size is " + Data.rawFreq.Size());
         return(0);
     }

     double weightSum = 0;
     ICollection <E> zeroWeightPatterns = new HashSet <E>();
     foreach (KeyValuePair <E, double> patternEntry in patsThatExtractedThis.EntrySet())
     {
         double weight;
         if (!usePatternWeights)
         {
             // Unweighted mode: every extracting pattern counts once.
             weight = 1.0;
         }
         else
         {
             weight = allSelectedPatterns.GetCount(patternEntry.Key);
             if (weight == 0)
             {
                 Redwood.Log(Redwood.Force, "Warning: Weight zero for " + patternEntry.Key + ". May be pattern was removed when choosing other patterns (if subsumed by another pattern).");
                 zeroWeightPatterns.Add(patternEntry.Key);
             }
         }
         weightSum += weight;
     }
     // Removal is deferred until after the loop so the counter is not modified while enumerated.
     Counters.RemoveKeys(patsThatExtractedThis, zeroWeightPatterns);
     return(weightSum / Data.processedDataFreq.GetCount(word));
 }
示例#5
0
        /// <summary>
        /// Selects candidate phrases from <paramref name="newdt"/> in the order produced by
        /// <c>Counters.ToPriorityQueue</c>, stopping once <c>constVars.numWordsToAdd</c> phrases have
        /// been chosen or a score falls below <paramref name="thresholdWordExtract"/>.
        /// </summary>
        /// <param name="newdt">Phrase scores; supplies both the iteration order and the returned counts.</param>
        /// <param name="terms">Phrase-to-pattern counts used to count non-redundant patterns.</param>
        /// <param name="useThresholdNumPatternsForTheseWords">
        /// Phrases contained here must have at least <c>constVars.thresholdNumPatternsApplied</c>
        /// non-redundant patterns to be chosen.
        /// </param>
        /// <param name="ignoreWords">
        /// Phrases to fuzzy-match against (may be null). Modified in place: a candidate rejected by
        /// the fuzzy match is added to it.
        /// </param>
        /// <param name="thresholdWordExtract">Minimum score a phrase needs to be considered.</param>
        /// <returns>A new counter holding the chosen phrases with their scores from <paramref name="newdt"/>.</returns>
        public virtual ICounter <CandidatePhrase> ChooseTopWords(
            ICounter <CandidatePhrase> newdt,
            TwoDimensionalCounter <CandidatePhrase, E> terms,
            ICounter <CandidatePhrase> useThresholdNumPatternsForTheseWords,
            ICollection <CandidatePhrase> ignoreWords,
            double thresholdWordExtract)
        {
            IEnumerator <CandidatePhrase> termIter   = Counters.ToPriorityQueue(newdt).GetEnumerator();
            ICounter <CandidatePhrase>    finalwords = new ClassicCounter <CandidatePhrase>();

            // MoveNext() advances the enumerator (unlike Java's hasNext()). When the loop below stops
            // because enough words were chosen, the enumerator already sits on an element that has not
            // been looked at yet; this flag lets the "next ten" loop report it instead of skipping it.
            bool currentNotYetConsumed = false;

            while (termIter.MoveNext())
            {
                if (finalwords.Size() >= constVars.numWordsToAdd)
                {
                    currentNotYetConsumed = true;
                    break;
                }
                CandidatePhrase w = termIter.Current;
                if (newdt.GetCount(w) < thresholdWordExtract)
                {
                    Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of  " + thresholdWordExtract);
                    break;
                }
                System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
                if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
                {
                    Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
                    continue;
                }
                CandidatePhrase matchedFuzzy = null;
                if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
                {
                    matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
                }
                if (matchedFuzzy == null)
                {
                    Redwood.Log("extremePatDebug", "adding word " + w);
                    finalwords.SetCount(w, newdt.GetCount(w));
                }
                else
                {
                    // matchedFuzzy can only be non-null when ignoreWords is non-null (see above).
                    Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
                    ignoreWords.Add(w);
                }
            }

            // Debug aid only: list up to ten phrases that follow the ones examined above.
            string nextTen = string.Empty;
            int    n       = 0;

            while (currentNotYetConsumed || termIter.MoveNext())
            {
                currentNotYetConsumed = false;
                n++;
                if (n > 10)
                {
                    break;
                }
                CandidatePhrase w = termIter.Current;
                nextTen += ";\t" + w + ":" + newdt.GetCount(w);
            }
            Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
            return(finalwords);
        }
        /// <summary>
        /// Returns the dictionary-odds weight of <paramref name="word"/> for <paramref name="label"/>:
        /// the stored weight when the phrase itself has one, otherwise a weight derived from its
        /// individual words via <c>GetPhraseWeightFromWords</c>.
        /// </summary>
        /// <param name="word">Phrase to score.</param>
        /// <param name="label">Key into <c>constVars.dictOddsWeights</c>.</param>
        /// <param name="defaultWt">Fallback weight forwarded to <c>GetPhraseWeightFromWords</c>.</param>
        public virtual double GetDictOddsScore(CandidatePhrase word, string label, double defaultWt)
        {
            ICounter <CandidatePhrase> labelWeights = constVars.dictOddsWeights[label];
            System.Diagnostics.Debug.Assert(labelWeights != null, "dictOddsWordWeights is null for label " + label);

            return labelWeights.ContainsKey(word)
                ? labelWeights.GetCount(word)
                : GetPhraseWeightFromWords(labelWeights, word, defaultWt);
        }
        /// <summary>
        /// Scores a phrase from its Google n-gram count (lower-cased lookup plus as-is lookup) and its
        /// raw corpus frequency.
        /// </summary>
        /// <param name="g">Phrase to score.</param>
        /// <returns>
        /// 0 when the summed n-gram count equals -1; 1 when <c>Data.rawFreq</c> has no entry for the
        /// phrase; otherwise <c>(1 + rawFreq * sqrt(Data.ratioGoogleNgramFreqWithDataFreq)) / count</c>.
        /// </returns>
        public static double GetGoogleNgramScore(CandidatePhrase g)
        {
            double ngramCount = GoogleNGramsSQLBacked.GetCount(g.GetPhrase().ToLower()) + GoogleNGramsSQLBacked.GetCount(g.GetPhrase());

            // NOTE(review): -1 is presumably the "no count available" sentinel - confirm against GoogleNGramsSQLBacked.
            if (ngramCount == -1)
            {
                return(0);
            }
            if (!Data.rawFreq.ContainsKey(g))
            {
                //returning 1 because usually lower this tf-idf score the better. if we don't have raw freq info, give it a bad score
                return(1);
            }
            double scaledRawFreq = Data.rawFreq.GetCount(g) * Math.Sqrt(Data.ratioGoogleNgramFreqWithDataFreq);
            return((1 + scaledRawFreq) / ngramCount);
        }
        /// <summary>
        /// Derives a weight for a phrase from the weights of its individual words.
        /// </summary>
        /// <param name="weights">Per-phrase weights; looked up by <c>CandidatePhrase</c>.</param>
        /// <param name="ph">Phrase to score.</param>
        /// <param name="defaultWt">Weight used for any word (or single-word phrase) absent from <paramref name="weights"/>.</param>
        /// <returns>
        /// For a phrase that splits into fewer than two parts: its own weight, or
        /// <paramref name="defaultWt"/> when it has none. Otherwise the average of the per-word weights
        /// when <c>useAvgInsteadofMinPhraseScoring</c> is set, else their minimum.
        /// </returns>
        public virtual double GetPhraseWeightFromWords(ICounter <CandidatePhrase> weights, CandidatePhrase ph, double defaultWt)
        {
            // NOTE(review): "\\s+" is a regex in the Java original; confirm that the Split overload
            // bound here treats it as a pattern rather than a literal separator.
            string[] t = ph.GetPhrase().Split("\\s+");
            if (t.Length < 2)
            {
                if (weights.ContainsKey(ph))
                {
                    return(weights.GetCount(ph));
                }
                else
                {
                    return(defaultWt);
                }
            }
            double totalscore = 0;
            double minScore   = double.MaxValue;

            foreach (string w in t)
            {
                double score = defaultWt;
                // The counter is keyed by CandidatePhrase, so the same key object must be used for
                // both the membership test and the lookup (looking up the raw string never matches).
                CandidatePhrase wordPhrase = CandidatePhrase.CreateOrGet(w);
                if (weights.ContainsKey(wordPhrase))
                {
                    score = weights.GetCount(wordPhrase);
                }
                if (score < minScore)
                {
                    minScore = score;
                }
                totalscore += score;
            }
            if (useAvgInsteadofMinPhraseScoring)
            {
                // Average over the number of words, not the number of characters in the phrase.
                return(totalscore / t.Length);
            }
            else
            {
                return(minScore);
            }
        }
示例#9
0
        /// <summary>
        /// Learns new phrases for <paramref name="label"/> from the patterns learned in this iteration.
        /// <para>
        /// Step 1: fills <paramref name="wordsPatExtracted"/>, either by counting listed patterns per
        /// token (<c>constVars.doNotApplyPatterns</c>) or by calling <c>ApplyPats</c>.
        /// Step 2 (when <paramref name="computeProcDataFreq"/> is set): derives
        /// <c>Data.processedDataFreq</c> from <c>Data.rawFreq</c> using the scorer's normalization.
        /// Step 3: chooses words according to <c>constVars.wordScoring</c> (Weightednorm or Bpb).
        /// </para>
        /// </summary>
        /// <param name="label">Label being learned.</param>
        /// <param name="patternsForEachToken">Per-token patterns, used only when patterns are not applied.</param>
        /// <param name="patternsLearnedThisIter">Patterns (with weights) learned in this iteration.</param>
        /// <param name="allSelectedPatterns">All selected patterns, forwarded to the phrase scorer.</param>
        /// <param name="alreadyIdentifiedWords">Words already identified; excluded from the Bpb choice and forwarded to the scorer.</param>
        /// <param name="matchedTokensByPat">Forwarded to <c>ApplyPats</c>.</param>
        /// <param name="scoreForAllWordsThisIteration">Cleared and refilled with all phrase scores (Weightednorm only).</param>
        /// <param name="terms">Receives the phrase-to-pattern counts that are scored.</param>
        /// <param name="wordsPatExtracted">Receives the phrase-to-pattern extraction counts.</param>
        /// <param name="patternsAndWords4Label">Pattern-to-word counts used to build the "reason words" output.</param>
        /// <param name="identifier">Sub-directory name under <c>constVars.outDir</c> for the justification file.</param>
        /// <param name="ignoreWords">Extra words to ignore when choosing; may be null or empty.</param>
        /// <param name="computeProcDataFreq">Whether to (re)compute <c>Data.processedDataFreq</c>.</param>
        /// <returns>The chosen words with their scores; may be empty in Bpb mode.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(
            string label,
            PatternsForEachToken patternsForEachToken,
            ICounter <E> patternsLearnedThisIter,
            ICounter <E> allSelectedPatterns,
            ICollection <CandidatePhrase> alreadyIdentifiedWords,
            CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat,
            ICounter <CandidatePhrase> scoreForAllWordsThisIteration,
            TwoDimensionalCounter <CandidatePhrase, E> terms,
            TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted,
            TwoDimensionalCounter <E, CandidatePhrase> patternsAndWords4Label,
            string identifier,
            ICollection <CandidatePhrase> ignoreWords,
            bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        // Report the raw frequency (fq.Value), not the already-normalized value.
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + fq.Value);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    // No normalization: the processed counter is the raw counter itself (same object).
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                // Keep only extracted words that belong to no other semantic class (by phrase or by
                // lemma) and that were not already labeled while applying the patterns.
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                // NOTE(review): this and the matching print below look like leftover debugging output.
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        // One line per selected phrase: "<phrase>:<gold value>", or "<phrase>:UNKNOWN"
                        // when the gold data has no entry for it. (The code previously passed a null
                        // callback to ForEach, which left this report empty.)
                        StringBuilder s = new StringBuilder();
                        foreach (CandidatePhrase chosen in finalwords.KeySet())
                        {
                            string chosenPhrase = chosen.GetPhrase();
                            bool   isGold;
                            s.Append(chosenPhrase);
                            if (goldEntities4Label.TryGetValue(chosenPhrase, out isGold))
                            {
                                s.Append(":").Append(isGold);
                            }
                            else
                            {
                                s.Append(":UNKNOWN");
                            }
                            s.Append("\n");
                        }
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    // For each chosen word, count the words associated (in patternsAndWords4Label)
                    // with the patterns that extracted it.
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        // Carry over the entries written by earlier iterations; the reader is
                        // released even if reading the existing file fails.
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        try
                        {
                            IJsonArray objarr = jsonReader.ReadArray();
                            foreach (IJsonValue o in objarr)
                            {
                                obj.Add(o);
                            }
                        }
                        finally
                        {
                            jsonReader.Close();
                        }
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    // For every term remember the highest weight among its patterns and which
                    // pattern carried it.
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    // The small epsilon groups words whose best weight ties with the maximum.
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        // Tie-break on how often the best pattern extracted the word.
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // An enumerator starts before the first element, so reading Current
                            // without MoveNext() never yields it; iterate to pick the sole element.
                            foreach (CandidatePhrase only in words)
                            {
                                bestw = only;
                            }
                        }
                        else
                        {
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
示例#10
0
        /// <summary>
        /// Counts the patterns that extracted <paramref name="w"/> whose string form neither contains
        /// nor is contained in the string form of any pattern that comes later in the key array.
        /// </summary>
        /// <param name="terms">Phrase-to-pattern counts; only the pattern keys of <paramref name="w"/> are read.</param>
        /// <param name="w">Phrase whose patterns are examined.</param>
        /// <returns>The count, as a double.</returns>
        private double NumNonRedundantPatterns(TwoDimensionalCounter <CandidatePhrase, E> terms, CandidatePhrase w)
        {
            object[] patterns    = Sharpen.Collections.ToArray(terms.GetCounter(w).KeySet());
            int      uniqueCount = 0;

            for (int current = 0; current < patterns.Length; current++)
            {
                string currentText   = patterns[current].ToString();
                bool   overlapsLater = false;
                int    later         = current + 1;
                // Scan the remaining patterns until one overlaps textually with the current one.
                while (!overlapsLater && later < patterns.Length)
                {
                    string laterText = patterns[later].ToString();
                    overlapsLater = laterText.Contains(currentText) || currentText.Contains(laterText);
                    later++;
                }
                if (overlapsLater)
                {
                    continue;
                }
                uniqueCount++;
            }
            return(uniqueCount);
        }
示例#11
0
 /// <summary>
 /// Tells whether <paramref name="word"/> should be skipped: its lower-cased form is one of
 /// <paramref name="stopWords"/>, or the word matches <c>ignoreWordRegex</c>.
 /// </summary>
 /// <param name="word">Word to test.</param>
 /// <param name="stopWords">Stop-word phrases checked first.</param>
 public static bool DoNotUse(string word, ICollection <CandidatePhrase> stopWords)
 {
     CandidatePhrase lowered = CandidatePhrase.CreateOrGet(word.ToLower());
     if (stopWords.Contains(lowered))
     {
         return(true);
     }
     return(ignoreWordRegex.Matcher(word).Matches());
 }
        /// <summary>
        /// Collapses the (pattern, phrase) counter <paramref name="positivePatternsAndWords"/> into a
        /// counter with one accumulated score per pattern.
        /// </summary>
        /// <remarks>
        /// For every (pattern, phrase) entry a phrase score is computed and added to the pattern's
        /// total. The score starts at 1. When <paramref name="scorePhrasesInPatSelection"/> is set and
        /// the pattern scoring mode is PhEvalInPat/PhEvalInPatLogP, the score is the mean of the
        /// enabled phrase measures (cached per phrase for this call); when the mode is
        /// Logreg/LOGREGlogP it is one minus the classifier score of the phrase.
        /// </remarks>
        /// <param name="label">Label used for the edit distance, word class weight and classifier lookups.</param>
        /// <param name="scoringFunction">Applied to each (pattern, phrase) pair and multiplied into the score when <paramref name="useFreqPhraseExtractedByPat"/> is set.</param>
        /// <param name="allCandidatePhrases">Phrases for which the per-phrase measures (or classifier scores) are precomputed.</param>
        /// <param name="positivePatternsAndWords">Counter of phrases per pattern; its entries drive the accumulation.</param>
        /// <param name="sqrtPatScore">Not read by this method; the square-root decision is taken from <c>constVars.sqrtPatScore</c>.</param>
        /// <param name="scorePhrasesInPatSelection">Enables phrase scoring; when false every phrase keeps the base score of 1.</param>
        /// <param name="dictOddsWordWeights">Per-phrase weights used for the semantic odds measure.</param>
        /// <param name="useFreqPhraseExtractedByPat">When true the phrase score is multiplied by <paramref name="scoringFunction"/>.</param>
        /// <returns>A counter mapping each pattern to the sum of its (optionally square-rooted) phrase scores.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        internal virtual ICounter <E> Convert2OneDim(string label, IToDoubleFunction <Pair <E, CandidatePhrase> > scoringFunction, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> positivePatternsAndWords, bool
                                                     sqrtPatScore, bool scorePhrasesInPatSelection, ICounter <CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
        {
            //    if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
            //      Data.loadGoogleNGrams();
            //    }
            ICounter <E> patterns = new ClassicCounter <E>();
            ICounter <CandidatePhrase> googleNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> domainNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromOtherSemanticBinaryScores    = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter <CandidatePhrase>();
            double            externalWtsDefault = 0.5;
            ICounter <string> classifierScores   = null;

            // Nothing in this method assigns patternScoring, so the mode checks are evaluated once up front.
            bool scoreWithPhraseEval = (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection;
            bool scoreWithClassifier = (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection;

            if (scoreWithPhraseEval)
            {
                // Precompute the enabled per-phrase measures for every candidate phrase.
                foreach (CandidatePhrase gc in allCandidatePhrases)
                {
                    string g = gc.GetPhrase();
                    if (constVars.usePatternEvalEditDistOther)
                    {
                        editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalEditDistSame)
                    {
                        editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalGoogleNgram)
                    {
                        googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
                    }
                    if (constVars.usePatternEvalDomainNgram)
                    {
                        // calculate domain-ngram wts
                        if (Data.domainNGramRawFreq.ContainsKey(g))
                        {
                            System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                            domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                        }
                    }
                    if (constVars.usePatternEvalWordClass)
                    {
                        // Look the phrase up as written first, then lower-cased. An int can never be
                        // null, so the former "num == null" checks were always false; TryGetValue
                        // reports a missing key explicitly instead.
                        // NOTE(review): assumes GetWordClassClusters() returns a standard dictionary
                        // keyed by string with int values - confirm.
                        var  wordClassClusters = constVars.GetWordClassClusters();
                        int  num;
                        bool hasCluster = wordClassClusters.TryGetValue(g, out num) || wordClassClusters.TryGetValue(g.ToLower(), out num);
                        if (hasCluster && constVars.distSimWeights[label].ContainsKey(num))
                        {
                            externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                        }
                        else
                        {
                            // No cluster for the phrase, or no weight for its cluster: use the default weight.
                            externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                        }
                    }
                }
                if (constVars.usePatternEvalGoogleNgram)
                {
                    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalDomainNgram)
                {
                    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalWordClass)
                {
                    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
                }
            }
            else if (scoreWithClassifier)
            {
                // Score all candidate phrases with a ScorePhrases instance configured from a copy of props.
                Properties props2 = new Properties();
                props2.PutAll(props);
                props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
                ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
                System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
                ArgumentParser.FillOptions(typeof(Data), props2);
                classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
            }
            // Phrase scores computed in this call, so each phrase is scored once even if several patterns extract it.
            ICounter <CandidatePhrase> cachedScoresForThisIter = new ClassicCounter <CandidatePhrase>();

            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in positivePatternsAndWords.EntrySet())
            {
                foreach (KeyValuePair <CandidatePhrase, double> en2 in en.Value.EntrySet())
                {
                    CandidatePhrase word = en2.Key;
                    ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>();
                    double score = 1;
                    if (scoreWithPhraseEval)
                    {
                        if (cachedScoresForThisIter.ContainsKey(word))
                        {
                            score = cachedScoresForThisIter.GetCount(word);
                        }
                        else
                        {
                            if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                            {
                                // Phrases from other semantic classes or common English words keep the base score.
                                score = 1;
                            }
                            else
                            {
                                if (constVars.usePatternEvalSemanticOdds)
                                {
                                    double semanticClassOdds = 1;
                                    if (dictOddsWordWeights.ContainsKey(word))
                                    {
                                        semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                                }
                                if (constVars.usePatternEvalGoogleNgram)
                                {
                                    double gscore = 0;
                                    if (googleNgramNormScores.ContainsKey(word))
                                    {
                                        gscore = 1 - googleNgramNormScores.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                                }
                                if (constVars.usePatternEvalDomainNgram)
                                {
                                    double domainscore;
                                    if (domainNgramNormScores.ContainsKey(word))
                                    {
                                        domainscore = 1 - domainNgramNormScores.GetCount(word);
                                    }
                                    else
                                    {
                                        domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                                }
                                if (constVars.usePatternEvalWordClass)
                                {
                                    double externalFeatureWt = externalWtsDefault;
                                    if (externalFeatWtsNormalized.ContainsKey(word))
                                    {
                                        externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                                }
                                if (constVars.usePatternEvalEditDistOther)
                                {
                                    System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                                }
                                if (constVars.usePatternEvalEditDistSame)
                                {
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                                }
                                // taking average
                                score = Counters.Mean(scoreslist);
                                phInPatScores.SetCounter(word, scoreslist);
                            }
                            cachedScoresForThisIter.SetCount(word, score);
                        }
                    }
                    else if (scoreWithClassifier)
                    {
                        score = 1 - classifierScores.GetCount(word);
                    }
                    // score = 1 - scorePhrases.scoreUsingClassifer(classifier,
                    // e.getKey(), label, true, null, null, dictOddsWordWeights);
                    // throw new RuntimeException("not implemented yet");
                    if (useFreqPhraseExtractedByPat)
                    {
                        score = score * scoringFunction.ApplyAsDouble(new Pair <E, CandidatePhrase>(en.Key, word));
                    }
                    // The flag on constVars (not the sqrtPatScore parameter) decides whether to dampen the score.
                    if (constVars.sqrtPatScore)
                    {
                        patterns.IncrementCount(en.Key, Math.Sqrt(score));
                    }
                    else
                    {
                        patterns.IncrementCount(en.Key, score);
                    }
                }
            }
            return(patterns);
        }
        /// <summary>
        /// Parses a JSON request into run properties and seed words, prepares the input
        /// file setting, and assigns the answer/human label classes for every seed label.
        /// </summary>
        /// <remarks>
        /// The "seedWords" key is read as an object mapping each label to an array of phrases;
        /// every other key is copied into the properties as a string. The resulting properties
        /// are passed to <c>SetProperties</c> and then stored in the <c>props</c> field.
        /// </remarks>
        /// <param name="line">JSON object text describing the properties and seed words.</param>
        /// <param name="readFile">When true the "input" value is used as the file to read; otherwise the "input" value is written to a temporary file which is used instead.</param>
        /// <param name="writeOutputToFile">When true (and <paramref name="readFile"/> is true) "columnOutputFile" is set to the output file unless already present.</param>
        /// <param name="additionalSeedWordsFiles">Optional seed word files whose entries are merged into labels already present in the seed words; may be null or empty.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        internal virtual void SetUpProperties(string line, bool readFile, bool writeOutputToFile, string additionalSeedWordsFiles)
        {
            IJsonReader reader  = Javax.Json.Json.CreateReader(new StringReader(line));
            IJsonObject request = reader.ReadObject();
            reader.Close();

            Properties settings = new Properties();
            foreach (string key in request.Keys)
            {
                if (!key.Equals("seedWords"))
                {
                    // Plain key: copy the string value straight into the properties.
                    settings.SetProperty(key, request.GetString(key));
                    continue;
                }
                // "seedWords": one array of phrases per label; each label starts from a fresh set.
                IJsonObject seedsByLabel = request.GetJsonObject(key);
                foreach (string seedLabel in seedsByLabel.Keys)
                {
                    seedWords[seedLabel] = new HashSet <CandidatePhrase>();
                    IJsonArray phrases = seedsByLabel.GetJsonArray(seedLabel);
                    for (int idx = 0; idx < phrases.Count; idx++)
                    {
                        string phrase = phrases.GetString(idx);
                        seedWords[seedLabel].Add(CandidatePhrase.CreateOrGet(phrase));
                        System.Console.Out.WriteLine("adding " + phrase + " for label " + seedLabel);
                    }
                }
            }
            System.Console.Out.WriteLine("seedwords are " + seedWords);

            bool hasExtraSeedFiles = additionalSeedWordsFiles != null && !additionalSeedWordsFiles.IsEmpty();
            if (hasExtraSeedFiles)
            {
                IDictionary <string, ICollection <CandidatePhrase> > extraSeeds = GetPatternsFromDataMultiClass.ReadSeedWords(additionalSeedWordsFiles);
                logger.Info("additional seed words are " + extraSeeds);
                // Only labels that already have seed words receive additional ones.
                foreach (string knownLabel in seedWords.Keys)
                {
                    if (extraSeeds.Contains(knownLabel))
                    {
                        Sharpen.Collections.AddAll(seedWords[knownLabel], extraSeeds[knownLabel]);
                    }
                }
            }

            outputFile = null;
            if (!readFile)
            {
                // The "input" value is the text itself: spill it to a temporary file and read from there.
                string tmpDir  = Runtime.GetProperty("java.io.tmpdir");
                File   scratch = File.CreateTempFile("sents", ".tmp", new File(tmpDir));
                scratch.DeleteOnExit();
                IOUtils.WriteStringToFile(settings.GetProperty("input"), scratch.GetPath(), "utf8");
                settings.SetProperty("file", scratch.GetAbsolutePath());
            }
            else
            {
                // The "input" value names the file to read.
                string inputPath = request.GetString("input");
                System.Console.Out.WriteLine("input value is " + inputPath);
                outputFile = settings.GetProperty("input") + "_processed";
                settings.SetProperty("file", inputPath);
                if (writeOutputToFile && !settings.Contains("columnOutputFile"))
                {
                    settings.SetProperty("columnOutputFile", outputFile);
                }
            }
            SetProperties(settings);
            this.props = settings;

            // Labels are numbered from 1 in the iteration order of seedWords.Keys; the number
            // selects the PatternLabel / PatternHumanLabel class assigned to the label.
            int classIndex = 1;
            foreach (string assignedLabel in seedWords.Keys)
            {
                string machineClassName = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + classIndex;
                string humanClassName   = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternHumanLabel" + classIndex;
                machineAnswerClasses[assignedLabel] = (Type)Sharpen.Runtime.GetType(machineClassName);
                humanLabelClasses[assignedLabel]    = (Type)Sharpen.Runtime.GetType(humanClassName);
                classIndex++;
            }
        }
        /// <summary>
        /// Applies a previously saved model to the current text and returns the suggested
        /// phrases as JSON.
        /// </summary>
        /// <remarks>
        /// Run properties are loaded from <paramref name="modelPropertiesFile"/>, stripped of a fixed
        /// list of properties, pointed at <paramref name="stopWordsFile"/> for the stop/common/English
        /// word files, and then overlaid with the instance properties and
        /// <paramref name="testProps"/> (later overlays win). The merged properties are also copied
        /// back into the instance properties. Extractions are collected for the first label only.
        /// </remarks>
        /// <param name="testProps">Properties applied last, on top of the model and instance properties.</param>
        /// <param name="modelPropertiesFile">Properties file passed as "-props" when building the run properties.</param>
        /// <param name="stopWordsFile">File used for "stopWordsPatternFiles", "englishWordsFiles" and "commonWordsPatternFiles".</param>
        /// <returns>The JSON produced by <c>GetSetWordsAsJson</c> for the collected extractions merged with the model's matched seed words.</returns>
        /// <exception cref="System.InvalidOperationException">The model has no labels.</exception>
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.MissingMethodException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="Java.Sql.SQLException"/>
        public virtual string SuggestPhrasesTest(Properties testProps, string modelPropertiesFile, string stopWordsFile)
        {
            logger.Info("Suggesting phrases in test");
            logger.Info("test properties are " + testProps);
            Properties runProps = StringUtils.ArgsToPropertiesWithResolve(new string[] { "-props", modelPropertiesFile });

            // Properties from the saved model that are dropped before the run.
            string[] removeProperties = new string[] { "allPatternsDir", "storePatsForEachToken", "invertedIndexClass", "savePatternsWordsDir", "batchProcessSents", "outDir", "saveInvertedIndex", "removeOverLappingLabels", "numThreads" };
            foreach (string s in removeProperties)
            {
                if (runProps.Contains(s))
                {
                    runProps.Remove(s);
                }
            }
            runProps.SetProperty("stopWordsPatternFiles", stopWordsFile);
            runProps.SetProperty("englishWordsFiles", stopWordsFile);
            runProps.SetProperty("commonWordsPatternFiles", stopWordsFile);
            // Overlay order matters: instance properties first, then the test properties on top.
            runProps.PutAll(props);
            runProps.PutAll(testProps);
            props.PutAll(runProps);
            ProcessText(false);
            GetPatternsFromDataMultiClass <SurfacePattern> model = new GetPatternsFromDataMultiClass <SurfacePattern>(runProps, Data.sents, seedWords, true, humanLabelClasses);

            ArgumentParser.FillOptions(model, runProps);
            GetPatternsFromDataMultiClass.LoadFromSavedPatternsWordsDir(model, runProps);
            // Populated but not read again in this method.
            IDictionary <string, int> alreadyLearnedIters = new Dictionary <string, int>();

            foreach (string label in model.constVars.GetLabels())
            {
                alreadyLearnedIters[label] = model.constVars.GetLearnedWordsEachIter()[label].LastEntry().Key;
            }
            if (model.constVars.learn)
            {
                //      Map<String, E> p0 = new HashMap<String, SurfacePattern>();
                //      Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<String, Counter<CandidatePhrase>>();
                //      Map<String, Set<E>> ignorePatterns = new HashMap<String, Set<E>>();
                model.IterateExtractApply(null, null, null);
            }
            IDictionary <string, ICounter <CandidatePhrase> > allExtractions = new Dictionary <string, ICounter <CandidatePhrase> >();
            //Only for one label right now!
            // An enumerator's Current is not valid before MoveNext() has been called, so the
            // first label is taken by iterating (which also disposes the enumerator).
            string label_1    = null;
            bool   foundLabel = false;
            foreach (string firstLabel in model.constVars.GetLabels())
            {
                label_1    = firstLabel;
                foundLabel = true;
                break;
            }
            if (!foundLabel)
            {
                throw new System.InvalidOperationException("Cannot suggest phrases: the model has no labels");
            }

            allExtractions[label_1] = new ClassicCounter <CandidatePhrase>();
            foreach (KeyValuePair <string, DataInstance> sent in Data.sents)
            {
                // Accumulates the words of a run of consecutive tokens that carry matched patterns.
                StringBuilder str = new StringBuilder();
                foreach (CoreLabel l in sent.Value.GetTokens())
                {
                    if (l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null && !l.Get(typeof(PatternsAnnotations.MatchedPatterns)).IsEmpty())
                    {
                        str.Append(" " + l.Word());
                    }
                    else
                    {
                        // NOTE(review): this counts the (possibly empty) accumulated phrase on every
                        // unmatched token, and a run that reaches the end of the sentence is never
                        // counted - confirm whether both are intended.
                        allExtractions[label_1].IncrementCount(CandidatePhrase.CreateOrGet(str.ToString().Trim()));
                        str.Length = 0;
                    }
                }
            }
            allExtractions.PutAll(model.matchedSeedWords);
            return(model.constVars.GetSetWordsAsJson(allExtractions));
        }