コード例 #1
0
        /// <summary>
        /// Adds new seed phrases supplied as a JSON object and runs
        /// <c>GetPatternsFromDataMultiClass.RunLabelSeedWords</c> over <c>Data.sents</c> for each label.
        /// </summary>
        /// <param name="line">
        /// JSON string mapping each label to an array of phrases,
        /// e.g. {"name":["Bush","Carter","Obama"]}.
        /// </param>
        /// <returns>The fixed status string "SUCCESS added new phrases".</returns>
        /// <exception cref="System.Exception"/>
        public virtual string DoNewPhrases(string line)
        {
            System.Console.Out.WriteLine("adding new phrases");
            ConstantsAndVariables constVars  = new ConstantsAndVariables(props, humanLabelClasses.Keys, humanLabelClasses);
            IJsonReader           jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
            IJsonObject           objarr;

            try
            {
                objarr = jsonReader.ReadObject();
            }
            finally
            {
                // Release the reader even when the input cannot be parsed.
                jsonReader.Close();
            }

            foreach (KeyValuePair <string, IJsonValue> o in objarr)
            {
                string label = o.Key;
                ICollection <CandidatePhrase> seed = new HashSet <CandidatePhrase>();
                IJsonArray arr = objarr.GetJsonArray(label);
                for (int i = 0; i < arr.Count; i++)
                {
                    string seedw = arr.GetString(i);
                    System.Console.Out.WriteLine("adding " + seedw + " to seed ");
                    seed.Add(CandidatePhrase.CreateOrGet(seedw));
                }
                // Record the new phrases both in this object's seed words and in the
                // per-call configuration before labelling the sentences with them.
                Sharpen.Collections.AddAll(seedWords[label], seed);
                constVars.AddSeedWords(label, seed);
                GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seed, constVars, false);
            }
            return("SUCCESS added new phrases");
        }
コード例 #2
0
 /// <summary>
 /// Forwards the shared scoring inputs to the base constructor and stores
 /// <paramref name="p0Set"/> and <paramref name="p0"/> on this instance.
 /// </summary>
 public ScorePatternsF1(
     ConstantsAndVariables constVars,
     GetPatternsFromDataMultiClass.PatternScoring patternScoring,
     string label,
     ICollection <CandidatePhrase> allCandidatePhrases,
     TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label,
     Properties props,
     ICounter <CandidatePhrase> p0Set,
     E p0)
     : base(constVars, patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props)
 {
     this.p0Set = p0Set;
     this.p0    = p0;
 }
コード例 #3
0
 /// <summary>
 /// Forwards the shared scoring inputs to the base constructor and stores
 /// <paramref name="phInPatScores"/> and <paramref name="scorePhrases"/> on this instance.
 /// </summary>
 public ScorePatternsRatioModifiedFreq(
     ConstantsAndVariables constVars,
     GetPatternsFromDataMultiClass.PatternScoring patternScoring,
     string label,
     ICollection <CandidatePhrase> allCandidatePhrases,
     TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label,
     TwoDimensionalCounter <CandidatePhrase, ConstantsAndVariables.ScorePhraseMeasures> phInPatScores,
     ScorePhrases scorePhrases,
     Properties props)
     : base(constVars, patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props)
 {
     this.scorePhrases  = scorePhrases;
     this.phInPatScores = phInPatScores;
 }
コード例 #4
0
 /// <summary>
 /// Stores <paramref name="constVars"/> and initializes both it and this object from
 /// <paramref name="props"/>.
 /// </summary>
 /// <param name="props">Properties passed to <c>ArgumentParser.FillOptions</c> and to both <c>SetUp</c> calls.</param>
 /// <param name="constVars">Shared configuration object; its <c>SetUp</c> is invoked here.</param>
 /// <exception cref="System.IO.IOException"/>
 public CreatePatterns(Properties props, ConstantsAndVariables constVars)
 {
     //String channelNameLogger = "createpatterns";
     this.constVars = constVars;
     // Options are filled on the ConstantsAndVariables type before the instance is set up.
     ArgumentParser.FillOptions(typeof(ConstantsAndVariables), props);
     constVars.SetUp(props);
     // This object's own set-up runs last, after constVars has been set up.
     SetUp(props);
 }
コード例 #5
0
        /// <summary>
        /// Selects up to <c>constVars.numWordsToAdd</c> phrases from <paramref name="newdt"/>,
        /// visiting them in the order produced by <c>Counters.ToPriorityQueue(newdt)</c>
        /// (presumably highest score first, since the threshold check stops the scan - confirm).
        /// </summary>
        /// <param name="newdt">Candidate phrases and their scores.</param>
        /// <param name="terms">Phrase-to-pattern counts, used for the non-redundant pattern check and for logging.</param>
        /// <param name="useThresholdNumPatternsForTheseWords">
        /// Phrases contained in this counter are skipped when <c>NumNonRedundantPatterns</c> is below
        /// <c>constVars.thresholdNumPatternsApplied</c>.
        /// </param>
        /// <param name="ignoreWords">
        /// May be null. When fuzzy matching is enabled, a candidate that fuzzily matches an entry
        /// is rejected and is itself added to this collection.
        /// </param>
        /// <param name="thresholdWordExtract">Scanning stops at the first phrase whose score is below this value.</param>
        /// <returns>The selected phrases with their scores from <paramref name="newdt"/>.</returns>
        public virtual ICounter <CandidatePhrase> ChooseTopWords(ICounter <CandidatePhrase> newdt, TwoDimensionalCounter <CandidatePhrase, E> terms, ICounter <CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection <CandidatePhrase> ignoreWords
                                                                 , double thresholdWordExtract)
        {
            IEnumerator <CandidatePhrase> termIter   = Counters.ToPriorityQueue(newdt).GetEnumerator();
            ICounter <CandidatePhrase>    finalwords = new ClassicCounter <CandidatePhrase>();
            // Set when the selection loop stops on the size limit: MoveNext() has then already
            // advanced to a phrase that was never examined, and that phrase must still be
            // reported as the first of the "next ten".
            bool currentNotYetExamined = false;

            while (termIter.MoveNext())
            {
                if (finalwords.Size() >= constVars.numWordsToAdd)
                {
                    currentNotYetExamined = true;
                    break;
                }
                CandidatePhrase w = termIter.Current;
                if (newdt.GetCount(w) < thresholdWordExtract)
                {
                    Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of  " + thresholdWordExtract);
                    break;
                }
                System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
                if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
                {
                    Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non redundant patterns are below threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
                    continue;
                }
                CandidatePhrase matchedFuzzy = null;
                if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
                {
                    matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
                }
                if (matchedFuzzy == null)
                {
                    Redwood.Log("extremePatDebug", "adding word " + w);
                    finalwords.SetCount(w, newdt.GetCount(w));
                }
                else
                {
                    // matchedFuzzy can only be non-null when ignoreWords is non-null (see above).
                    Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
                    ignoreWords.Add(w);
                }
            }

            // Report up to ten phrases that follow the selection, for debugging only.
            string nextTen = string.Empty;
            int    n       = 0;
            bool   hasTerm = currentNotYetExamined || termIter.MoveNext();

            while (hasTerm && n < 10)
            {
                CandidatePhrase w = termIter.Current;
                nextTen += ";\t" + w + ":" + newdt.GetCount(w);
                n++;
                hasTerm = termIter.MoveNext();
            }
            Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
            return(finalwords);
        }
コード例 #6
0
        /// <summary>
        /// Builds the context restriction for one token from its generalize-class annotations and,
        /// when <c>useContextNERRestriction</c> is set, from its named-entity tag.
        /// </summary>
        /// <param name="tokenj">Token to inspect; every generalize class must be set on it.</param>
        /// <returns>
        /// A triple of: true when no annotation differed from its background symbol; a Surface
        /// <c>Token</c> carrying one OR-restriction per non-background annotation; and the
        /// "|"-joined keys (and NER tag) of those annotations.
        /// </returns>
        /// <exception cref="System.Exception">A generalize class is missing or null on the token.</exception>
        internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj)
        {
            bool   onlyBackground = true;
            string labelsJoined   = string.Empty;
            Token  restriction    = new Token(PatternFactory.PatternType.Surface);

            foreach (KeyValuePair <string, Type> generalizeClass in ConstantsAndVariables.GetGeneralizeClasses())
            {
                Type annotationKey = generalizeClass.Value;
                bool isMissing     = !tokenj.ContainsKey(annotationKey) || tokenj.Get(annotationKey) == null;
                if (isMissing)
                {
                    throw new Exception(" Why does the token not have the class " + annotationKey + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
                }
                if (tokenj.Get(annotationKey).Equals(ConstantsAndVariables.backgroundSymbol))
                {
                    // Background value: contributes nothing to the restriction.
                    continue;
                }
                onlyBackground = false;
                labelsJoined   = labelsJoined.IsEmpty() ? generalizeClass.Key : labelsJoined + "|" + generalizeClass.Key;
                restriction.AddORRestriction(annotationKey, generalizeClass.Key);
            }

            if (useContextNERRestriction)
            {
                string nerTag       = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                bool   hasEntityTag = nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol);
                if (hasEntityTag)
                {
                    onlyBackground = false;
                    labelsJoined   = labelsJoined.IsEmpty() ? nerTag : labelsJoined + "|" + nerTag;
                    restriction.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
                }
            }
            return(new Triple <bool, Token, string>(onlyBackground, restriction, labelsJoined));
        }
コード例 #7
0
 /// <summary>
 /// Stores the sentences, the ids of the sentences to process, the patterns and the
 /// stop-word options on this instance. No other work is done here.
 /// </summary>
 public ApplyPatterns(
     IDictionary <string, DataInstance> sents,
     IList <string> sentids,
     IDictionary <TokenSequencePattern, E> patterns,
     string label,
     bool removeStopWordsFromSelectedPhrases,
     bool removePhrasesWithStopWords,
     ConstantsAndVariables cv)
 {
     constVars     = cv;
     this.label    = label;
     this.sents    = sents;
     this.sentids  = sentids;
     this.patterns = patterns;
     this.removePhrasesWithStopWords         = removePhrasesWithStopWords;
     this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
 }
コード例 #8
0
 /// <summary>
 /// Creates the word-restriction predicate and stores the sentences, sentence ids,
 /// dependency patterns and stop-word options on this instance.
 /// </summary>
 public ApplyDepPatterns(
     IDictionary <string, DataInstance> sents,
     IList <string> sentids,
     IDictionary <SemgrexPattern, E> patterns,
     string label,
     bool removeStopWordsFromSelectedPhrases,
     bool removePhrasesWithStopWords,
     ConstantsAndVariables cv)
 {
     // Created first, exactly as before; the predicate receives this instance.
     matchingWordRestriction = new _IPredicate_183(this);

     constVars     = cv;
     this.label    = label;
     this.sents    = sents;
     this.sentids  = sentids;
     this.patterns = patterns;
     this.removePhrasesWithStopWords         = removePhrasesWithStopWords;
     this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
 }
コード例 #9
0
 /// <summary>
 /// Stores the sentences, sentence ids, patterns and stop-word options on this instance and
 /// builds a multi-pattern matcher over the pattern keys.
 /// </summary>
 public ApplyPatternsMulti(
     IDictionary <string, DataInstance> sents,
     IList <string> sentids,
     IDictionary <TokenSequencePattern, E> patterns,
     string label,
     bool removeStopWordsFromSelectedPhrases,
     bool removePhrasesWithStopWords,
     ConstantsAndVariables cv)
 {
     this.sents    = sents;
     this.patterns = patterns;
     // One matcher covering every pattern key supplied by the caller.
     multiPatternMatcher = TokenSequencePattern.GetMultiPatternMatcher(patterns.Keys);
     this.sentids = sentids;
     this.label   = label;
     constVars    = cv;
     this.removePhrasesWithStopWords         = removePhrasesWithStopWords;
     this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
 }
コード例 #10
0
 /// <summary>
 /// Fills this object's options from <paramref name="props"/>, instantiates the configured
 /// phrase scorer through its <c>ConstantsAndVariables</c> constructor, and fills the
 /// scorer's options from the same properties.
 /// </summary>
 /// <param name="props">Properties used to fill options on this object and on the phrase scorer.</param>
 /// <param name="cv">Shared configuration; stored and passed to the phrase scorer's constructor.</param>
 /// <exception cref="System.Exception">
 /// The phrase scorer could not be created reflectively; the original failure is the inner exception.
 /// </exception>
 public ScorePhrases(Properties props, ConstantsAndVariables cv)
 {
     // Options must be filled first: phraseScorerClass is read below.
     ArgumentParser.FillOptions(this, props);
     this.constVars = cv;
     try
     {
         phraseScorer = phraseScorerClass.GetConstructor(typeof(ConstantsAndVariables)).NewInstance(constVars);
     }
     catch (ReflectiveOperationException e)
     {
         // System.Exception has no constructor that takes only an exception; pass the
         // message and keep the cause as InnerException so the stack trace is not lost.
         throw new Exception(e.Message, e);
     }
     ArgumentParser.FillOptions(phraseScorer, props);
 }
コード例 #11
0
 /// <summary>
 /// Stores the configuration, scoring mode, label, candidate phrases and the three
 /// pattern/phrase counters (positive, negative, unlabeled) on this instance.
 /// </summary>
 public ScorePatterns(
     ConstantsAndVariables constVars,
     GetPatternsFromDataMultiClass.PatternScoring patternScoring,
     string label,
     ICollection <CandidatePhrase> allCandidatePhrases,
     TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label,
     TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label,
     Properties props)
 {
     // Configuration and scoring mode.
     this.props          = props;
     this.constVars      = constVars;
     this.patternScoring = patternScoring;

     // What is being scored.
     this.label               = label;
     this.allCandidatePhrases = allCandidatePhrases;

     // Pattern-to-phrase counts for positive, negative and unlabeled matches.
     this.patternsandWords4Label          = patternsandWords4Label;
     this.negPatternsandWords4Label       = negPatternsandWords4Label;
     this.unLabeledPatternsandWords4Label = unLabeledPatternsandWords4Label;
 }
コード例 #12
0
        /// <summary>
        /// Loads sentences via <c>GetPatternsFromDataMultiClass.ProcessSents</c>, labels them with the
        /// current seed words of every label, and optionally writes the column output.
        /// </summary>
        /// <remarks>
        /// NOTE(review): an earlier comment on this method described a JSON input with required keys
        /// "input" and "seedWords", e.g.
        /// {"input":"presidents.txt","seedWords":{"name":["Obama"],"place":["Chicago"]}}.
        /// This signature takes no such argument; presumably that input is parsed elsewhere - confirm.
        /// </remarks>
        /// <param name="writeOutputToFile">When true, the column output is written to <c>outputFile</c>.</param>
        /// <returns>The literal string "SUCCESS".</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="Java.Sql.SQLException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.MissingMethodException"/>
        public virtual string ProcessText(bool writeOutputToFile)
        {
            logger.Info("Starting to process text");
            logger.Info("all seed words are " + seedWords);

            // Only the first element of the pair returned by ProcessSents is used.
            Data.sents = GetPatternsFromDataMultiClass.ProcessSents(props, seedWords.Keys).First();
            ConstantsAndVariables labelingConfig = new ConstantsAndVariables(props, seedWords.Keys, machineAnswerClasses);

            foreach (string seedLabel in seedWords.Keys)
            {
                GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[seedLabel], seedLabel, seedWords[seedLabel], labelingConfig, true);
            }

            if (writeOutputToFile)
            {
                GetPatternsFromDataMultiClass.WriteColumnOutput(outputFile, false, humanLabelClasses);
                System.Console.Out.WriteLine("written the output to " + outputFile);
            }

            logger.Info("Finished processing text");
            return("SUCCESS");
        }
コード例 #13
0
        /// <summary>
        /// Builds a single sentence with a hand-written dependency graph, asks
        /// <c>CreatePatterns</c> for all DEP patterns and prints the graph and the resulting
        /// per-token patterns. Contains no assertions.
        /// </summary>
        public virtual void Test()
        {
            Properties config = new Properties();

            config.SetProperty("patternType", "DEP");
            ConstantsAndVariables       constants      = new ConstantsAndVariables(config, new HashSet <string>(), new Dictionary <string, Type>());
            CreatePatterns <DepPattern> patternBuilder = new CreatePatterns <DepPattern>(config, constants);

            IDictionary <string, DataInstance> sentences = new Dictionary <string, DataInstance>();
            ICoreMap sentence     = new ArrayCoreMap();
            string   sentenceText = "We present a paper that focuses on semantic graphs applied to language.";
            SemanticGraph dependencies = SemanticGraph.ValueOf(
                "[present/VBP-2 nsubj>We/PRP-1 dobj>[paper/NN-4 det>a/DT-3] ccomp>[applied/VBN-10 mark>that/IN-5 nsubj>[focuses/NN-6 nmod:on>[graphs/NNS-9 amod>semantic/JJ-8]] nmod:to>language/NN-12]]");
            IList <string> words = Arrays.AsList(new string[] { "We", "present", "a", "paper", "that", "focuses", "on", "semantic", "graphs", "applied", "to", "language" });

            // NOTE(review): Map(null) looks like a word-to-token mapping function that was lost
            // when this code was converted; as written no mapper is supplied - confirm and restore.
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words.Stream().Map(null).Collect(Collectors.ToList()));
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), dependencies);
            sentences["sent1"] = DataInstance.GetNewInstance(PatternFactory.PatternType.Dep, sentence);

            patternBuilder.GetAllPatterns(sentences, config, ConstantsAndVariables.PatternForEachTokenWay.Memory);

            System.Console.Out.WriteLine("graph is " + dependencies);
            System.Console.Out.WriteLine(PatternsForEachTokenInMemory.patternsForEachToken);
        }
コード例 #14
0
        /// <summary>
        /// Collects the phrases extracted by the patterns learned in this iteration and selects the
        /// new phrases for <paramref name="label"/> according to <c>constVars.wordScoring</c>.
        /// </summary>
        /// <remarks>
        /// Steps: (1) apply the patterns, or only gather statistics when
        /// <c>constVars.doNotApplyPatterns</c> is set; (2) optionally recompute
        /// <c>Data.processedDataFreq</c>; (3) score and choose phrases. In the Weightednorm mode the
        /// selection is logged and, when <c>constVars.outDir</c> is set, this iteration's
        /// justification is added to <c>words.json</c> under <c>outDir/identifier/label</c>.
        /// </remarks>
        /// <param name="label">Label for which phrases are being learned.</param>
        /// <param name="patternsForEachToken">Used only when patterns are not applied (statistics mode).</param>
        /// <param name="patternsLearnedThisIter">Patterns to apply; their counts are the pattern weights in Bpb mode.</param>
        /// <param name="allSelectedPatterns">Forwarded to the phrase scorer.</param>
        /// <param name="alreadyIdentifiedWords">Forwarded to the phrase scorer; removed from the candidates in Bpb mode.</param>
        /// <param name="matchedTokensByPat">Passed to <c>ApplyPats</c>.</param>
        /// <param name="scoreForAllWordsThisIteration">Cleared and refilled with all phrase scores (Weightednorm mode only).</param>
        /// <param name="terms">Receives the candidate phrases together with the patterns that extracted them.</param>
        /// <param name="wordsPatExtracted">Passed to the pattern application step and read afterwards as phrase-to-pattern counts.</param>
        /// <param name="patternsAndWords4Label">Pattern-to-phrase counts used to derive the "reason words" in the justification file.</param>
        /// <param name="identifier">Sub-directory name used below <c>constVars.outDir</c>.</param>
        /// <param name="ignoreWords">Extra phrases to ignore; may be null or empty.</param>
        /// <param name="computeProcDataFreq">When true, <c>Data.processedDataFreq</c> is recomputed from <c>Data.rawFreq</c>.</param>
        /// <returns>The selected phrases; an empty counter in Bpb mode when nothing qualifies.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords,
                                                                  CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms,
                                                                  TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords,
                                                                  bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // if want to get the stats by the lossy way of just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        // Report the raw frequency (fq.Value); @in already holds the processed value here.
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + fq.Value);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                // Candidates: extracted phrases that are not in another semantic class (by surface
                // form or lemma) and were not already labeled while applying the patterns.
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                // NOTE(review): this and the "ignoreWordsAll contains word U.S." line below look like
                // leftover debugging output; kept because they are part of the current stdout output.
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        // NOTE(review): the per-word callback was lost (null was passed to ForEach), so
                        // nothing was ever appended. The line format below is a reconstruction - confirm.
                        foreach (CandidatePhrase selected in finalwords.KeySet())
                        {
                            string phrase = selected.GetPhrase();
                            bool   isGold;
                            s.Append(phrase);
                            s.Append(goldEntities4Label.TryGetValue(phrase, out isGold) ? ":" + isGold : ":UNKNOWN");
                            s.Append("\n");
                        }
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    // For each selected word, count the phrases associated with the patterns that extracted it.
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // the json object is an array corresponding to each iteration - of list
                    // of objects,
                    // each of which is a bean of entity and reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        // Carry over the entries written by earlier iterations.
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        try
                        {
                            IJsonArray objarr = jsonReader.ReadArray();
                            foreach (IJsonValue o in objarr)
                            {
                                obj.Add(o);
                            }
                        }
                        finally
                        {
                            // Close the reader even when the existing file cannot be read.
                            jsonReader.Close();
                        }
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    // For every phrase, find the weight of its best pattern and remember that pattern.
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    // Phrases tied (within 1e-10) for the highest best-pattern weight.
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        // Break the tie by how often the best pattern extracted the phrase.
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // Take the only element. Reading Current on a fresh enumerator without
                            // calling MoveNext() does not yield the first element.
                            foreach (CandidatePhrase single in words)
                            {
                                bestw = single;
                                break;
                            }
                        }
                        else
                        {
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
コード例 #15
0
 /// <summary>
 /// Stores the shared configuration on this scorer.
 /// </summary>
 /// <param name="constvar">Configuration object kept in <c>constVars</c>.</param>
 public PhraseScorer(ConstantsAndVariables constvar)
 {
     // Original note: "these get overwritten in ScorePhrasesLearnFeatWt class".
     constVars = constvar;
 }
コード例 #16
0
            /// <summary>
            /// Computes the patterns around the tokens of every sentence in <c>sentIds</c> and hands
            /// them to <c>patsForEach</c>.
            /// </summary>
            /// <remarks>
            /// When patterns are stored in memory each sentence is added immediately. Otherwise the
            /// sentences are buffered and added in groups of 1000 to reduce the number of commits,
            /// with a final call for the remainder.
            /// </remarks>
            /// <returns>Always true.</returns>
            /// <exception cref="System.Exception"/>
            public virtual bool Call()
            {
                // The storage mode is checked once instead of three times per sentence.
                bool batchCommits = !this._enclosing.constVars.storePatsForEachToken.Equals(ConstantsAndVariables.PatternForEachTokenWay.Memory);
                IDictionary <string, IDictionary <int, ICollection <E> > > tempPatternsForTokens = new Dictionary <string, IDictionary <int, ICollection <E> > >();
                int numSentencesInOneCommit = 0;

                foreach (string id in this.sentIds)
                {
                    DataInstance sent = this.sents[id];
                    // NOTE(review): the cast goes through the non-generic IDictionary (a raw-type
                    // cast in the code this was converted from); kept unchanged - confirm it is intended.
                    IDictionary <int, ICollection <E> > p = (IDictionary)PatternFactory.GetPatternsAroundTokens(this._enclosing.constVars.patternType, sent, ConstantsAndVariables.GetStopWords());
                    if (batchCommits)
                    {
                        //to save number of commits to the database
                        tempPatternsForTokens[id] = p;
                        numSentencesInOneCommit++;
                        if (numSentencesInOneCommit % 1000 == 0)
                        {
                            this.patsForEach.AddPatterns(tempPatternsForTokens);
                            tempPatternsForTokens.Clear();
                            numSentencesInOneCommit = 0;
                        }
                    }
                    else
                    {
                        this.patsForEach.AddPatterns(id, p);
                    }
                }
                //For the remaining sentences
                if (batchCommits)
                {
                    this.patsForEach.AddPatterns(tempPatternsForTokens);
                }
                return(true);
            }
コード例 #17
0
 /// <summary>
 /// Passes the shared configuration to the base constructor; no additional state is set here.
 /// </summary>
 /// <param name="constvar">Configuration object forwarded to the base class.</param>
 public ScorePhrasesAverageFeatures(ConstantsAndVariables constvar) : base(constvar)
 {
 }