// `line` is a JSON string that maps each label to an array of phrase strings, e.g. {"name":["Bush","Carter","Obama"]}
        /// <exception cref="System.Exception"/>
        /// <summary>
        /// Adds the phrases in <paramref name="line"/> to the seed words of their labels and runs
        /// <c>GetPatternsFromDataMultiClass.RunLabelSeedWords</c> over <c>Data.sents</c> for each label.
        /// </summary>
        /// <param name="line">
        /// JSON object string mapping a label to an array of phrase strings. Every label is used to index
        /// <c>seedWords</c> and <c>humanLabelClasses</c>, so it is expected to be present in both.
        /// </param>
        /// <returns>The fixed status string <c>"SUCCESS added new phrases"</c>.</returns>
        public virtual string DoNewPhrases(string line)
        {
            System.Console.Out.WriteLine("adding new phrases");
            ConstantsAndVariables constVars  = new ConstantsAndVariables(props, humanLabelClasses.Keys, humanLabelClasses);
            IJsonReader           jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
            IJsonObject           objarr;

            // Release the reader as soon as the object has been read (or reading failed);
            // SetUpProperties closes its reader the same way.
            try
            {
                objarr = jsonReader.ReadObject();
            }
            finally
            {
                jsonReader.Close();
            }

            foreach (KeyValuePair <string, IJsonValue> o in objarr)
            {
                string label = o.Key;
                ICollection <CandidatePhrase> seed = new HashSet <CandidatePhrase>();
                IJsonArray arr = objarr.GetJsonArray(o.Key);
                for (int i = 0; i < arr.Count; i++)
                {
                    string seedw = arr.GetString(i);
                    System.Console.Out.WriteLine("adding " + seedw + " to seed ");
                    seed.Add(CandidatePhrase.CreateOrGet(seedw));
                }
                // Register the new phrases both in the long-lived seed map and in the
                // ConstantsAndVariables instance used for this labelling run.
                Sharpen.Collections.AddAll(seedWords[label], seed);
                constVars.AddSeedWords(label, seed);
                GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seed, constVars, false);
            }
            return("SUCCESS added new phrases");
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.MissingMethodException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <summary>
        /// Resets the pattern labels on <c>Data.sents</c>, builds a fresh surface-pattern model from the
        /// current properties and seed words, runs <c>IterateExtractApply</c> on it and returns the
        /// learned words.
        /// </summary>
        /// <returns>The value of <c>GetLearnedWordsAsJson()</c> on the model's <c>constVars</c>.</returns>
        public virtual string SuggestPhrases()
        {
            ResetPatternLabelsInSents(Data.sents);

            GetPatternsFromDataMultiClass <SurfacePattern> learner =
                new GetPatternsFromDataMultiClass <SurfacePattern>(props, Data.sents, seedWords, false, humanLabelClasses);
            learner.IterateExtractApply();

            string learnedWordsJson = learner.constVars.GetLearnedWordsAsJson();
            return learnedWordsJson;
        }
 /// <exception cref="System.Exception"/>
 /// <summary>
 /// Recursively walks the children of <paramref name="curr"/> in <paramref name="g"/>, adding accepted
 /// nodes to <paramref name="descendantSet"/> and collecting features for each node whose child is followed.
 /// </summary>
 /// <param name="g">Graph being traversed.</param>
 /// <param name="curr">Node visited by this call.</param>
 /// <param name="descendantSet">Output collection of accepted nodes; a node already in it is not expanded again.</param>
 /// <param name="allCutOffRels">Passed through to <c>CheckIfSatisfiesRelConstrains</c> for every edge.</param>
 /// <param name="doNotAddThese">Optional (may be null) nodes that are neither added nor descended into.</param>
 /// <param name="seenNodes">Nodes already visited; guards against visiting a node twice.</param>
 /// <param name="ignoreCommonTags">When true, nodes whose trimmed tag is in <c>ignoreTags</c> are not added (their children are still visited).</param>
 /// <param name="acceptWord">Predicate on the node's backing label; a rejected node stops the descent.</param>
 /// <param name="feat">Per-node-index feature lists, filled through <c>GetPatternsFromDataMultiClass.GetFeatures</c>.</param>
 private static void DescendantsHelper(SemanticGraph g, IndexedWord curr, ICollection <IndexedWord> descendantSet, IList <string> allCutOffRels, IList <IndexedWord> doNotAddThese, IList <IndexedWord> seenNodes, bool ignoreCommonTags, IPredicate <CoreLabel> acceptWord, CollectionValuedMap <int, string> feat)
 {
     // Visit every node at most once.
     if (seenNodes.Contains(curr))
     {
         return;
     }
     seenNodes.Add(curr);

     // Stop at nodes that are already collected, explicitly excluded, or rejected by the predicate.
     if (descendantSet.Contains(curr))
     {
         return;
     }
     if (doNotAddThese != null && doNotAddThese.Contains(curr))
     {
         return;
     }
     if (!acceptWord.Test(curr.BackingLabel()))
     {
         return;
     }

     bool hasIgnoredTag = ignoreCommonTags && ignoreTags.Contains(curr.Tag().Trim());
     if (!hasIgnoredTag)
     {
         descendantSet.Add(curr);
     }

     foreach (IndexedWord child in g.GetChildren(curr))
     {
         if (doNotAddThese != null && doNotAddThese.Contains(child))
         {
             continue;
         }

         GrammaticalRelation rel = g.Reln(curr, child);
         if (CheckIfSatisfiesRelConstrains(g, curr, child, allCutOffRels, rel))
         {
             continue;
         }

         // Skip children whose tag matches any of the cut-off tag patterns.
         bool tagIsCutOff = false;
         foreach (string cutOffTagRegex in cutoffTags)
         {
             if (!child.Tag().Matches(cutOffTagRegex))
             {
                 continue;
             }
             if (Debug >= 5)
             {
                 System.Console.Out.WriteLine("ignored tag " + child + " because it satisfied " + cutOffTagRegex);
             }
             tagIsCutOff = true;
             break;
         }
         if (tagIsCutOff)
         {
             continue;
         }

         // Features are recorded under the parent's index, using the relation to this child.
         if (!feat.Contains(curr.Index()))
         {
             feat[curr.Index()] = new List <string>();
         }
         GetPatternsFromDataMultiClass.GetFeatures(g, curr, false, feat[curr.Index()], rel);
         DescendantsHelper(g, child, descendantSet, allCutOffRels, doNotAddThese, seenNodes, ignoreCommonTags, acceptWord, feat);
     }
 }
        // The `line` input (as parsed by SetUpProperties) is a JSON object string. Required keys are "input" and "seedWords".
        // "input" is either the text itself or a file path (in which case readFile must be true).
        // For example: {"input":"presidents.txt","seedWords":{"name":["Obama"],"place":["Chicago"]}}
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="Java.Sql.SQLException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.MissingMethodException"/>
        /// <summary>
        /// Processes the configured sentences into <c>Data.sents</c> and labels them with the seed words
        /// of every label in <c>seedWords</c>.
        /// </summary>
        /// <param name="writeOutputToFile">
        /// When true, the column output is written via <c>WriteColumnOutput</c> to <c>outputFile</c>.
        /// </param>
        /// <returns>The fixed status string <c>"SUCCESS"</c>.</returns>
        public virtual string ProcessText(bool writeOutputToFile)
        {
            logger.Info("Starting to process text");
            logger.Info("all seed words are " + seedWords);

            // Only the first element of the returned pair is used here.
            Pair <IDictionary <string, DataInstance>, IDictionary <string, DataInstance> > processed =
                GetPatternsFromDataMultiClass.ProcessSents(props, seedWords.Keys);
            Data.sents = processed.First();

            ConstantsAndVariables labelingVars = new ConstantsAndVariables(props, seedWords.Keys, machineAnswerClasses);
            foreach (string seedLabel in seedWords.Keys)
            {
                GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[seedLabel], seedLabel, seedWords[seedLabel], labelingVars, true);
            }

            if (writeOutputToFile)
            {
                GetPatternsFromDataMultiClass.WriteColumnOutput(outputFile, false, humanLabelClasses);
                System.Console.Out.WriteLine("written the output to " + outputFile);
            }

            logger.Info("Finished processing text");
            return "SUCCESS";
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <summary>
        /// Computes one weight per pattern for the current label as the ratio of a numerator and a
        /// denominator counter, both produced by <c>Convert2OneDim</c>; the arguments passed for the
        /// denominator depend on <c>patternScoring</c>.
        /// </summary>
        /// <remarks>
        /// For the <c>PhEvalInPatLogP</c> and <c>LOGREGlogP</c> modes the ratio is additionally multiplied
        /// by the natural log of the size of each pattern's phrase counter. <c>Counters.RetainNonZeros</c>
        /// is applied to the result before it is returned.
        /// NOTE(review): every scoring function handed to <c>Convert2OneDim</c> here is null, while
        /// <c>Convert2OneDim</c> calls <c>scoringFunction.ApplyAsDouble</c> whenever its
        /// <c>useFreqPhraseExtractedByPat</c> argument is true (the <c>SqrtAllRatio</c> mode) - confirm
        /// whether real functions were lost in translation.
        /// </remarks>
        /// <returns>Counter of pattern weights for the current label.</returns>
        public override ICounter <E> Score()
        {
            ICounter <CandidatePhrase> normalizedDictOdds = null;
            if (constVars.dictOddsWeights.Contains(label))
            {
                normalizedDictOdds = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
            }

            bool weightByExtractionFreq = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio);

            IToDoubleFunction <Pair <E, CandidatePhrase> > numeratorFn = null;
            ICounter <E> numerator = this.Convert2OneDim(label, numeratorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, null, weightByExtractionFreq);

            IToDoubleFunction <Pair <E, CandidatePhrase> > denominatorFn = null;
            ICounter <E> denominator;
            if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegUnlabOdds)
                || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RatioAll)
                || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegOdds))
            {
                // These three modes currently issue the same call.
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, normalizedDictOdds, weightByExtractionFreq);
            }
            else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg)
                     || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP))
            {
                // Phrase-evaluation and classifier modes ask Convert2OneDim to score phrases itself.
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, true, normalizedDictOdds, weightByExtractionFreq);
            }
            else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio))
            {
                denominator = this.Convert2OneDim(label, denominatorFn, allCandidatePhrases, patternsandWords4Label, true, false, normalizedDictOdds, weightByExtractionFreq);
            }
            else
            {
                throw new Exception("Cannot understand patterns scoring");
            }

            ICounter <E> patternWeights = Counters.DivisionNonNaN(numerator, denominator);

            // Multiplying by logP
            bool multiplyByLogP = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)
                                  || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP);
            if (multiplyByLogP)
            {
                ICounter <E> logPhraseCounts = new ClassicCounter <E>();
                foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > patternEntry in patternsandWords4Label.EntrySet())
                {
                    logPhraseCounts.SetCount(patternEntry.Key, Math.Log(patternEntry.Value.Size()));
                }
                Counters.MultiplyInPlace(patternWeights, logPhraseCounts);
            }

            Counters.RetainNonZeros(patternWeights);
            return patternWeights;
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <summary>
        /// Collapses the pattern-by-phrase counter <paramref name="positivePatternsAndWords"/> into one
        /// weight per pattern by accumulating a score for every phrase recorded under that pattern.
        /// </summary>
        /// <remarks>
        /// The per-phrase score starts at 1. When <paramref name="scorePhrasesInPatSelection"/> is true it
        /// is replaced, depending on <c>patternScoring</c>, either by the mean of the enabled
        /// <c>usePatternEval*</c> measures (<c>PhEvalInPat</c> / <c>PhEvalInPatLogP</c>) or by one minus
        /// the classifier score (<c>Logreg</c> / <c>LOGREGlogP</c>).
        /// NOTE(review): <paramref name="sqrtPatScore"/> is never read; the square root is controlled by
        /// <c>constVars.sqrtPatScore</c> instead - confirm which one is intended.
        /// </remarks>
        /// <param name="label">Label the scores are computed for.</param>
        /// <param name="scoringFunction">
        /// Multiplier applied to each (pattern, phrase) score when
        /// <paramref name="useFreqPhraseExtractedByPat"/> is true; it is dereferenced in that case.
        /// </param>
        /// <param name="allCandidatePhrases">Phrases for which the per-phrase measures are precomputed.</param>
        /// <param name="positivePatternsAndWords">Pattern-by-phrase counter that is iterated.</param>
        /// <param name="sqrtPatScore">Currently unused (see remarks).</param>
        /// <param name="scorePhrasesInPatSelection">Enables phrase scoring as described in the remarks.</param>
        /// <param name="dictOddsWordWeights">Weights read when <c>constVars.usePatternEvalSemanticOdds</c> is set.</param>
        /// <param name="useFreqPhraseExtractedByPat">When true, each score is multiplied by <paramref name="scoringFunction"/>.</param>
        /// <returns>Counter holding the accumulated weight of every pattern.</returns>
        internal virtual ICounter <E> Convert2OneDim(string label, IToDoubleFunction <Pair <E, CandidatePhrase> > scoringFunction, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter <CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
        {
            ICounter <E> patterns = new ClassicCounter <E>();
            ICounter <CandidatePhrase> googleNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> domainNgramNormScores     = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromOtherSemanticBinaryScores    = new ClassicCounter <CandidatePhrase>();
            ICounter <CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter <CandidatePhrase>();
            double            externalWtsDefault = 0.5;
            ICounter <string> classifierScores   = null;

            // The scoring mode does not change while this method runs, so decide it once.
            bool scoreByPhraseEval = (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection;
            bool scoreByClassifier = !scoreByPhraseEval && (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection;

            if (scoreByPhraseEval)
            {
                // Precompute the enabled per-phrase measures for every candidate phrase.
                foreach (CandidatePhrase gc in allCandidatePhrases)
                {
                    string g = gc.GetPhrase();
                    if (constVars.usePatternEvalEditDistOther)
                    {
                        editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalEditDistSame)
                    {
                        editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                    }
                    if (constVars.usePatternEvalGoogleNgram)
                    {
                        googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
                    }
                    if (constVars.usePatternEvalDomainNgram)
                    {
                        // calculate domain-ngram wts
                        if (Data.domainNGramRawFreq.ContainsKey(g))
                        {
                            System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                            domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                        }
                    }
                    if (constVars.usePatternEvalWordClass)
                    {
                        // An int can never equal null, so the former "num == null" test was always false:
                        // the lower-case fallback never ran and "num != null" was always true.
                        // TryGetValue restores the intended order: exact phrase, then lower-cased phrase,
                        // then the default weight.
                        int  num;
                        bool clusterFound = constVars.GetWordClassClusters().TryGetValue(g, out num);
                        if (!clusterFound)
                        {
                            clusterFound = constVars.GetWordClassClusters().TryGetValue(g.ToLower(), out num);
                        }
                        if (clusterFound && constVars.distSimWeights[label].ContainsKey(num))
                        {
                            externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                        }
                        else
                        {
                            externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                        }
                    }
                }
                if (constVars.usePatternEvalGoogleNgram)
                {
                    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalDomainNgram)
                {
                    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
                }
                if (constVars.usePatternEvalWordClass)
                {
                    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
                }
            }
            else if (scoreByClassifier)
            {
                // Score all candidate phrases with a ScorePhrases instance configured on a copy of props.
                Properties props2 = new Properties();
                props2.PutAll(props);
                props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
                ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
                System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
                ArgumentParser.FillOptions(typeof(Data), props2);
                classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
            }

            // A phrase can occur under several patterns; its phrase-evaluation score is computed once.
            ICounter <CandidatePhrase> cachedScoresForThisIter = new ClassicCounter <CandidatePhrase>();

            foreach (KeyValuePair <E, ClassicCounter <CandidatePhrase> > en in positivePatternsAndWords.EntrySet())
            {
                foreach (KeyValuePair <CandidatePhrase, double> en2 in en.Value.EntrySet())
                {
                    CandidatePhrase word  = en2.Key;
                    double          score = 1;
                    if (scoreByPhraseEval)
                    {
                        if (cachedScoresForThisIter.ContainsKey(word))
                        {
                            score = cachedScoresForThisIter.GetCount(word);
                        }
                        else
                        {
                            if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                            {
                                score = 1;
                            }
                            else
                            {
                                ICounter <ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter <ConstantsAndVariables.ScorePhraseMeasures>();
                                if (constVars.usePatternEvalSemanticOdds)
                                {
                                    double semanticClassOdds = 1;
                                    if (dictOddsWordWeights.ContainsKey(word))
                                    {
                                        semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                                }
                                if (constVars.usePatternEvalGoogleNgram)
                                {
                                    double gscore = 0;
                                    if (googleNgramNormScores.ContainsKey(word))
                                    {
                                        gscore = 1 - googleNgramNormScores.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                                }
                                if (constVars.usePatternEvalDomainNgram)
                                {
                                    double domainscore;
                                    if (domainNgramNormScores.ContainsKey(word))
                                    {
                                        domainscore = 1 - domainNgramNormScores.GetCount(word);
                                    }
                                    else
                                    {
                                        domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                                }
                                if (constVars.usePatternEvalWordClass)
                                {
                                    double externalFeatureWt = externalWtsDefault;
                                    if (externalFeatWtsNormalized.ContainsKey(word))
                                    {
                                        externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                                    }
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                                }
                                if (constVars.usePatternEvalEditDistOther)
                                {
                                    System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                                }
                                if (constVars.usePatternEvalEditDistSame)
                                {
                                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                                }
                                // taking average
                                score = Counters.Mean(scoreslist);
                                phInPatScores.SetCounter(word, scoreslist);
                            }
                            cachedScoresForThisIter.SetCount(word, score);
                        }
                    }
                    else if (scoreByClassifier)
                    {
                        score = 1 - classifierScores.GetCount(word);
                    }

                    if (useFreqPhraseExtractedByPat)
                    {
                        score = score * scoringFunction.ApplyAsDouble(new Pair <E, CandidatePhrase>(en.Key, word));
                    }
                    if (constVars.sqrtPatScore)
                    {
                        patterns.IncrementCount(en.Key, Math.Sqrt(score));
                    }
                    else
                    {
                        patterns.IncrementCount(en.Key, score);
                    }
                }
            }
            return(patterns);
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <summary>
        /// Parses the JSON object in <paramref name="line"/> and initialises the seed words, the run
        /// properties, the input/output file settings and the per-label answer classes.
        /// </summary>
        /// <param name="line">
        /// JSON object string. The <c>"seedWords"</c> key holds an object mapping labels to arrays of
        /// phrases; every other key is copied into the properties as a string value. The <c>"input"</c>
        /// key is read as either a file path or the text itself, depending on <paramref name="readFile"/>.
        /// </param>
        /// <param name="readFile">
        /// True when <c>"input"</c> names a file; otherwise its value is written to a temporary file.
        /// </param>
        /// <param name="writeOutputToFile">
        /// With <paramref name="readFile"/>, sets <c>columnOutputFile</c> to the output file unless it is already present.
        /// </param>
        /// <param name="additionalSeedWordsFiles">
        /// Optional seed-word files; their words are merged only into labels that already exist in <c>seedWords</c>.
        /// </param>
        internal virtual void SetUpProperties(string line, bool readFile, bool writeOutputToFile, string additionalSeedWordsFiles)
        {
            IJsonReader reader = Javax.Json.Json.CreateReader(new StringReader(line));
            IJsonObject config = reader.ReadObject();
            reader.Close();

            Properties parsedProps = new Properties();
            foreach (string key in config.Keys)
            {
                if (!key.Equals("seedWords"))
                {
                    parsedProps.SetProperty(key, config.GetString(key));
                    continue;
                }

                IJsonObject seedsByLabel = config.GetJsonObject(key);
                foreach (string seedLabel in seedsByLabel.Keys)
                {
                    seedWords[seedLabel] = new HashSet <CandidatePhrase>();
                    IJsonArray phrases = seedsByLabel.GetJsonArray(seedLabel);
                    for (int idx = 0; idx < phrases.Count; idx++)
                    {
                        string phrase = phrases.GetString(idx);
                        seedWords[seedLabel].Add(CandidatePhrase.CreateOrGet(phrase));
                        System.Console.Out.WriteLine("adding " + phrase + " for label " + seedLabel);
                    }
                }
            }
            System.Console.Out.WriteLine("seedwords are " + seedWords);

            bool hasAdditionalSeedFiles = additionalSeedWordsFiles != null && !additionalSeedWordsFiles.IsEmpty();
            if (hasAdditionalSeedFiles)
            {
                IDictionary <string, ICollection <CandidatePhrase> > extraSeeds = GetPatternsFromDataMultiClass.ReadSeedWords(additionalSeedWordsFiles);
                logger.Info("additional seed words are " + extraSeeds);
                foreach (string knownLabel in seedWords.Keys)
                {
                    if (!extraSeeds.Contains(knownLabel))
                    {
                        continue;
                    }
                    Sharpen.Collections.AddAll(seedWords[knownLabel], extraSeeds[knownLabel]);
                }
            }

            outputFile = null;
            if (!readFile)
            {
                // The input is the text itself: spill it to a temporary file and point "file" at it.
                string tmpDirPath = Runtime.GetProperty("java.io.tmpdir");
                File   scratch    = File.CreateTempFile("sents", ".tmp", new File(tmpDirPath));
                scratch.DeleteOnExit();
                IOUtils.WriteStringToFile(parsedProps.GetProperty("input"), scratch.GetPath(), "utf8");
                parsedProps.SetProperty("file", scratch.GetAbsolutePath());
            }
            else
            {
                // The input is a file path: use it directly and derive the output file name from it.
                System.Console.Out.WriteLine("input value is " + config.GetString("input"));
                outputFile = parsedProps.GetProperty("input") + "_processed";
                parsedProps.SetProperty("file", config.GetString("input"));
                if (writeOutputToFile && !parsedProps.Contains("columnOutputFile"))
                {
                    parsedProps.SetProperty("columnOutputFile", outputFile);
                }
            }

            SetProperties(parsedProps);
            this.props = parsedProps;

            // Labels are bound to numbered answer classes in the iteration order of seedWords.Keys.
            int classIndex = 1;
            foreach (string classLabel in seedWords.Keys)
            {
                string machineClassName = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + classIndex;
                Type   machineClass     = (Type)Sharpen.Runtime.GetType(machineClassName);
                machineAnswerClasses[classLabel] = machineClass;

                string humanClassName = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternHumanLabel" + classIndex;
                humanLabelClasses[classLabel] = (Type)Sharpen.Runtime.GetType(humanClassName);

                classIndex++;
            }
        }
 /// <summary>
 /// Forwards <paramref name="input"/> to
 /// <c>GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString</c> and returns its result.
 /// </summary>
 /// <param name="input">Value passed through unchanged to the underlying call.</param>
 public virtual string GetMatchedTokensByPhrase(string input)
 {
     string matchedTokensJson = GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString(input);
     return matchedTokensJson;
 }
 /// <summary>
 /// Returns the result of the parameterless
 /// <c>GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString</c> overload.
 /// </summary>
 public virtual string GetMatchedTokensByAllPhrases()
 {
     string matchedTokensJson = GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString();
     return matchedTokensJson;
 }
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.MissingMethodException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="Java.Sql.SQLException"/>
        /// <summary>
        /// Processes the current text, loads saved patterns and words into a new model via
        /// <c>LoadFromSavedPatternsWordsDir</c>, optionally continues learning, and returns the phrases
        /// found in <c>Data.sents</c> as JSON.
        /// </summary>
        /// <remarks>
        /// Properties are layered in this order (later entries overwrite earlier ones): the model
        /// properties file with a fixed list of keys removed, the three stop-word file settings, this
        /// instance's <c>props</c>, then <paramref name="testProps"/>. The merged set is copied back into
        /// <c>props</c> before the text is processed.
        /// Extractions are collected for the first label only.
        /// NOTE(review): a phrase is recorded each time an unmatched token is reached, so an empty phrase
        /// is counted for consecutive unmatched tokens and a matched run that ends a sentence is never
        /// recorded - confirm whether that is intended.
        /// </remarks>
        /// <param name="testProps">Properties that take precedence over all others.</param>
        /// <param name="modelPropertiesFile">Properties file of the saved model, loaded through <c>-props</c>.</param>
        /// <param name="stopWordsFile">File used for the stop-word, English-word and common-word settings.</param>
        /// <returns>The value of <c>GetSetWordsAsJson</c> for the collected extractions.</returns>
        /// <exception cref="System.InvalidOperationException">The model has no labels.</exception>
        public virtual string SuggestPhrasesTest(Properties testProps, string modelPropertiesFile, string stopWordsFile)
        {
            logger.Info("Suggesting phrases in test");
            logger.Info("test properties are " + testProps);
            Properties runProps = StringUtils.ArgsToPropertiesWithResolve(new string[] { "-props", modelPropertiesFile });

            string[] removeProperties = new string[] { "allPatternsDir", "storePatsForEachToken", "invertedIndexClass", "savePatternsWordsDir", "batchProcessSents", "outDir", "saveInvertedIndex", "removeOverLappingLabels", "numThreads" };
            foreach (string s in removeProperties)
            {
                if (runProps.Contains(s))
                {
                    runProps.Remove(s);
                }
            }
            runProps.SetProperty("stopWordsPatternFiles", stopWordsFile);
            runProps.SetProperty("englishWordsFiles", stopWordsFile);
            runProps.SetProperty("commonWordsPatternFiles", stopWordsFile);
            runProps.PutAll(props);
            runProps.PutAll(testProps);
            props.PutAll(runProps);
            ProcessText(false);
            GetPatternsFromDataMultiClass <SurfacePattern> model = new GetPatternsFromDataMultiClass <SurfacePattern>(runProps, Data.sents, seedWords, true, humanLabelClasses);

            ArgumentParser.FillOptions(model, runProps);
            GetPatternsFromDataMultiClass.LoadFromSavedPatternsWordsDir(model, runProps);
            IDictionary <string, int> alreadyLearnedIters = new Dictionary <string, int>();

            foreach (string label in model.constVars.GetLabels())
            {
                alreadyLearnedIters[label] = model.constVars.GetLearnedWordsEachIter()[label].LastEntry().Key;
            }
            if (model.constVars.learn)
            {
                model.IterateExtractApply(null, null, null);
            }
            IDictionary <string, ICounter <CandidatePhrase> > allExtractions = new Dictionary <string, ICounter <CandidatePhrase> >();

            //Only for one label right now!
            // An enumerator is positioned before the first element until MoveNext() is called, so reading
            // GetEnumerator().Current directly does not yield the first label. Take it with a loop instead.
            string label_1 = null;
            foreach (string firstLabel in model.constVars.GetLabels())
            {
                label_1 = firstLabel;
                break;
            }
            if (label_1 == null)
            {
                throw new System.InvalidOperationException("Cannot collect extractions: the model has no labels");
            }

            allExtractions[label_1] = new ClassicCounter <CandidatePhrase>();
            foreach (KeyValuePair <string, DataInstance> sent in Data.sents)
            {
                // Accumulates consecutive matched tokens into one phrase.
                StringBuilder str = new StringBuilder();
                foreach (CoreLabel l in sent.Value.GetTokens())
                {
                    if (l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null && !l.Get(typeof(PatternsAnnotations.MatchedPatterns)).IsEmpty())
                    {
                        str.Append(" " + l.Word());
                    }
                    else
                    {
                        allExtractions[label_1].IncrementCount(CandidatePhrase.CreateOrGet(str.ToString().Trim()));
                        str.Length = 0;
                    }
                }
            }
            allExtractions.PutAll(model.matchedSeedWords);
            return(model.constVars.GetSetWordsAsJson(allExtractions));
        }
        /// <summary>
        /// Scores the candidate phrases in <paramref name="terms"/> for <paramref name="label"/> by averaging
        /// the phrase-evaluation measures that are switched on in <c>constVars</c>.
        /// </summary>
        /// <remarks>
        /// Phrases contained in <paramref name="alreadyIdentifiedWords"/> are skipped. The per-measure scores of
        /// every remaining phrase are also stored in <c>phraseScoresNormalized</c>. A phrase whose mean score is
        /// infinite or NaN is logged and left out of the returned counter.
        /// </remarks>
        /// <param name="label">Label the phrases are scored for.</param>
        /// <param name="terms">Two-dimensional counter whose first keys are the candidate phrases to score.</param>
        /// <param name="wordsPatExtracted">Not read by this implementation.</param>
        /// <param name="allSelectedPatterns">Counter forwarded to <c>GetPatTFIDFScore</c> for every phrase.</param>
        /// <param name="alreadyIdentifiedWords">Phrases that must not be scored.</param>
        /// <param name="forLearningPatterns">Not read by this implementation.</param>
        /// <returns>Counter mapping each scored phrase to the mean of its enabled measures.</returns>
        internal override ICounter<CandidatePhrase> ScorePhrases(string label, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, bool forLearningPatterns)
        {
            IDictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> scores = new Dictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>>();

            if (Data.domainNGramsFile != null)
            {
                Data.LoadDomainNGrams();
            }
            Redwood.Log(ConstantsAndVariables.extremedebug, "Considering terms: " + terms.FirstKeySet());

            // calculate TF-IDF like scores
            ICounter<CandidatePhrase> tfidfScores = new ClassicCounter<CandidatePhrase>();
            if (constVars.usePhraseEvalPatWtByFreq)
            {
                foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
                {
                    double score = GetPatTFIDFScore(en.Key, en.Value, allSelectedPatterns);
                    tfidfScores.SetCount(en.Key, score);
                }
                Redwood.Log(ConstantsAndVariables.extremedebug, "BEFORE IDF " + Counters.ToSortedString(tfidfScores, 100, "%1$s:%2$f", "\t"));
                Counters.DivideInPlace(tfidfScores, Data.processedDataFreq);
            }

            ICounter<CandidatePhrase> externalFeatWtsNormalized     = new ClassicCounter<CandidatePhrase>();
            ICounter<CandidatePhrase> domainNgramNormScores         = new ClassicCounter<CandidatePhrase>();
            ICounter<CandidatePhrase> googleNgramNormScores         = new ClassicCounter<CandidatePhrase>();
            ICounter<CandidatePhrase> editDistanceOtherBinaryScores = new ClassicCounter<CandidatePhrase>();
            ICounter<CandidatePhrase> editDistanceSameBinaryScores  = new ClassicCounter<CandidatePhrase>();

            // First pass: collect the raw value of every enabled measure for every candidate phrase.
            foreach (CandidatePhrase gc in terms.FirstKeySet())
            {
                string g = gc.GetPhrase();
                if (constVars.usePhraseEvalEditDistOther)
                {
                    editDistanceOtherBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
                }
                if (constVars.usePhraseEvalEditDistSame)
                {
                    editDistanceSameBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresThisClassThreshold(label, g));
                }
                if (constVars.usePhraseEvalDomainNgram)
                {
                    // calculate domain-ngram wts
                    if (Data.domainNGramRawFreq.ContainsKey(g))
                    {
                        System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                        domainNgramNormScores.SetCount(gc, GetDomainNgramScore(g));
                    }
                    else
                    {
                        log.Info("why is " + g + " not present in domainNgram");
                    }
                }
                if (constVars.usePhraseEvalGoogleNgram)
                {
                    googleNgramNormScores.SetCount(gc, GetGoogleNgramScore(gc));
                }
                if (constVars.usePhraseEvalWordClass)
                {
                    // calculate dist sim weights
                    // Try the phrase as written, then lower-cased. TryGetValue is needed because an int can
                    // never be null and a standard dictionary indexer throws for a missing key, which made the
                    // lower-case fallback and the OOV branch unreachable.
                    var  clusters   = constVars.GetWordClassClusters();
                    int  num;
                    bool hasCluster = clusters.TryGetValue(g, out num) || clusters.TryGetValue(g.ToLower(), out num);
                    if (hasCluster && constVars.distSimWeights[label].ContainsKey(num))
                    {
                        externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                    }
                    else
                    {
                        externalFeatWtsNormalized.SetCount(gc, OOVExternalFeatWt);
                    }
                }
            }

            ICounter<CandidatePhrase> normTFIDFScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(tfidfScores, true, true, false);
            ICounter<CandidatePhrase> dictOddsScores  = null;
            if (constVars.usePhraseEvalSemanticOdds)
            {
                System.Diagnostics.Debug.Assert(constVars.dictOddsWeights != null, "usePhraseEvalSemanticOdds is true but dictOddsWeights is null for the label " + label);
                dictOddsScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
            }
            domainNgramNormScores     = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
            googleNgramNormScores     = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
            externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);

            // Second pass: assemble the normalized measures of each phrase that is not already identified.
            // A phrase missing from a normalized counter gets its value from GetPhraseWeightFromWords with
            // the matching OOV weight.
            foreach (CandidatePhrase word in terms.FirstKeySet())
            {
                if (alreadyIdentifiedWords.Contains(word))
                {
                    continue;
                }
                ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
                System.Diagnostics.Debug.Assert(normTFIDFScores.ContainsKey(word), "NormTFIDF score does not contain" + word);
                double tfscore = normTFIDFScores.GetCount(word);
                scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Patwtbyfreq, tfscore);
                if (constVars.usePhraseEvalSemanticOdds)
                {
                    double dscore = dictOddsScores.ContainsKey(word)
                        ? dictOddsScores.GetCount(word)
                        : GetPhraseWeightFromWords(dictOddsScores, word, OOVdictOdds);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, dscore);
                }
                if (constVars.usePhraseEvalDomainNgram)
                {
                    double domainscore = domainNgramNormScores.ContainsKey(word)
                        ? domainNgramNormScores.GetCount(word)
                        : GetPhraseWeightFromWords(domainNgramNormScores, word, OOVDomainNgramScore);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                }
                if (constVars.usePhraseEvalGoogleNgram)
                {
                    double googlescore = googleNgramNormScores.ContainsKey(word)
                        ? googleNgramNormScores.GetCount(word)
                        : GetPhraseWeightFromWords(googleNgramNormScores, word, OOVGoogleNgramScore);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, googlescore);
                }
                if (constVars.usePhraseEvalWordClass)
                {
                    double externalFeatureWt = externalFeatWtsNormalized.ContainsKey(word)
                        ? externalFeatWtsNormalized.GetCount(word)
                        : GetPhraseWeightFromWords(externalFeatWtsNormalized, word, OOVExternalFeatWt);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                }
                if (constVars.usePhraseEvalEditDistOther)
                {
                    System.Diagnostics.Debug.Assert(editDistanceOtherBinaryScores.ContainsKey(word), "How come no edit distance info?");
                    double editD = editDistanceOtherBinaryScores.GetCount(word);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editD);
                }
                if (constVars.usePhraseEvalEditDistSame)
                {
                    double editDSame = editDistanceSameBinaryScores.GetCount(word);
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDSame);
                }
                if (constVars.usePhraseEvalWordShape)
                {
                    scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Wordshape, this.GetWordShapeScore(word.GetPhrase(), label));
                }
                scores[word] = scoreslist;
                phraseScoresNormalized.SetCounter(word, scoreslist);
            }

            // Final score of a phrase is the mean over its measures; non-finite means are dropped.
            ICounter<CandidatePhrase> phraseScores = new ClassicCounter<CandidatePhrase>();
            foreach (KeyValuePair<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> wEn in scores)
            {
                double avgScore = Counters.Mean(wEn.Value);
                if (!double.IsInfinity(avgScore) && !double.IsNaN(avgScore))
                {
                    phraseScores.SetCount(wEn.Key, avgScore);
                }
                else
                {
                    Redwood.Log(Redwood.Dbg, "Ignoring " + wEn.Key + " because score is " + avgScore);
                }
            }
            return phraseScores;
        }
        // Example no. 12
        /// <summary>creates all patterns and saves them in the correct PatternsForEachToken* class appropriately</summary>
        /// <remarks>
        /// The keys of <paramref name="sents"/> are split into <c>constVars.numThreads</c> contiguous slices and
        /// one <c>CreatePatternsThread</c> task is submitted per slice; the last slice also takes whatever the
        /// integer division left over. The method waits for every task before returning.
        /// </remarks>
        /// <param name="sents">Data instances by key; the keys are what gets partitioned across the tasks.</param>
        /// <param name="props">Properties handed unchanged to every task.</param>
        /// <param name="storePatsForEachTokenWay">Storage mode handed unchanged to every task.</param>
        /// <exception cref="System.Exception">
        /// Thrown when waiting on a task fails; the executor is shut down immediately and the original failure
        /// is available as the inner exception.
        /// </exception>
        public virtual void GetAllPatterns(IDictionary<string, DataInstance> sents, Properties props, ConstantsAndVariables.PatternForEachTokenWay storePatsForEachTokenWay)
        {
            // new DateTime() is DateTime.MinValue, not "now"; take a real timestamp so the elapsed time is meaningful.
            DateTime      startDate = DateTime.UtcNow;
            IList<string> keyset    = new List<string>(sents.Keys);

            // Slice size per task (with a single thread this is simply the whole key set).
            int num = keyset.Count / constVars.numThreads;

            IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);

            Redwood.Log(ConstantsAndVariables.extremedebug, "Computing all patterns. keyset size is " + keyset.Count + ". Assigning " + num + " values to each thread");
            IList<IFuture<bool>> list = new List<IFuture<bool>>();

            for (int i = 0; i < constVars.numThreads; i++)
            {
                int  fromIndex   = i * num;
                bool isLastSlice = i == constVars.numThreads - 1;
                // The last slice runs to the end of the key set so the division remainder is not lost.
                int toIndex = isLastSlice ? keyset.Count : Math.Min(keyset.Count, (i + 1) * num);

                IList<string>   ids  = keyset.SubList(fromIndex, toIndex);
                ICallable<bool> task = new CreatePatterns.CreatePatternsThread(this, sents, ids, props, storePatsForEachTokenWay);
                list.Add(executor.Submit(task));
            }

            // Now retrieve the result
            foreach (IFuture<bool> future in list)
            {
                try
                {
                    future.Get();
                }
                catch (Exception e)
                {
                    executor.ShutdownNow();
                    // Keep the original failure as InnerException instead of losing it.
                    throw new Exception(e.Message, e);
                }
            }
            executor.Shutdown();

            DateTime endDate   = DateTime.UtcNow;
            string   timeTaken = GetPatternsFromDataMultiClass.ElapsedTime(startDate, endDate);

            Redwood.Log(Redwood.Dbg, "Done computing all patterns [" + timeTaken + "]");
        }
 //Here, the index (startIndex, endIndex) seems to be inclusive of the endIndex
 /// <summary>
 /// Extracts the phrase spanned by the descendants of <paramref name="w"/> in <paramref name="g"/> and records it.
 /// </summary>
 /// <remarks>
 /// Nodes already in <paramref name="seenNodes"/> or <paramref name="doNotAddThese"/> are skipped. Nodes found
 /// through the "conj_and" relation are processed recursively first and then added to
 /// <paramref name="doNotAddThese"/>. The phrase covers the smallest to the largest descendant index, truncated
 /// to <c>maxPhraseLength</c> tokens. Any exception is caught and its stack trace printed; nothing is rethrown.
 /// </remarks>
 /// <param name="g">Graph that is traversed.</param>
 /// <param name="w">Node whose descendants form the phrase.</param>
 /// <param name="additionalCutOffRels">Extra cut-off relations added to <c>cutoffRelations</c>; may be null.</param>
 /// <param name="textTokens">Tokens the phrase text is joined from (node indices are used 1-based against it).</param>
 /// <param name="listOfOutput">Receives each newly found phrase string.</param>
 /// <param name="listOfOutputIndices">Receives the zero-based (start, end) pair of each newly found span.</param>
 /// <param name="seenNodes">Nodes already visited; updated in place.</param>
 /// <param name="doNotAddThese">Nodes to exclude; updated in place with the "conj_and" nodes.</param>
 /// <param name="findSubTrees">When true, recurse into every not-yet-seen descendant after a new phrase is recorded.</param>
 /// <param name="extractedPhrases">Receives an <c>ExtractedPhrase</c> for each newly found span.</param>
 /// <param name="pattern">Pattern stored in every created <c>ExtractedPhrase</c>.</param>
 /// <param name="acceptWord">Predicate forwarded to <c>Descendants</c>.</param>
 public virtual void PrintSubGraph(SemanticGraph g, IndexedWord w, IList<string> additionalCutOffRels, IList<string> textTokens, ICollection<string> listOfOutput, ICollection<IntPair> listOfOutputIndices, IList<IndexedWord> seenNodes, IList<IndexedWord> doNotAddThese, bool findSubTrees, ICollection<ExtractedPhrase> extractedPhrases, SemgrexPattern pattern, IPredicate<CoreLabel> acceptWord)
 {
     try
     {
         if (seenNodes.Contains(w))
         {
             return;
         }
         seenNodes.Add(w);
         if (doNotAddThese.Contains(w))
         {
             return;
         }

         // Handle conjoined nodes on their own, then keep them out of this node's phrase.
         IList<IndexedWord> andNodes = new List<IndexedWord>();
         DescendantsWithReln(g, w, "conj_and", new List<IndexedWord>(), andNodes);
         foreach (IndexedWord andNode in andNodes)
         {
             PrintSubGraph(g, andNode, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
         }
         Sharpen.Collections.AddAll(doNotAddThese, andNodes);

         IList<string> allCutOffRels = new List<string>();
         if (additionalCutOffRels != null)
         {
             Sharpen.Collections.AddAll(allCutOffRels, additionalCutOffRels);
         }
         Sharpen.Collections.AddAll(allCutOffRels, cutoffRelations);

         CollectionValuedMap<int, string> featPerToken = new CollectionValuedMap<int, string>();
         ICollection<string>              feat         = new List<string>();
         GetPatternsFromDataMultiClass.GetFeatures(g, w, true, feat, null);
         ICollection<IndexedWord> words = Descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
         if (words.Count == 0)
         {
             return;
         }

         // Smallest and largest node index among the descendants.
         int min = int.MaxValue;
         int max = -1;
         foreach (IndexedWord word in words)
         {
             min = Math.Min(min, word.Index());
             max = Math.Max(max, word.Index());
         }
         // Truncate over-long spans to maxPhraseLength tokens, keeping the start.
         if ((max - min + 1) > maxPhraseLength)
         {
             max = min + maxPhraseLength - 1;
         }

         IntPair indices = new IntPair(min - 1, max - 1);
         string  phrase  = StringUtils.Join(textTokens.SubList(min - 1, max), " ").Trim();
         feat.Add("LENGTH-" + (max - min + 1));
         for (int i = min; i <= max; i++)
         {
             Sharpen.Collections.AddAll(feat, featPerToken[i]);
         }
         ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.AsCounter(feat));

         // The former "&& !doNotAddThese.Contains(phrase)" compared a string against a list of IndexedWord:
         // it could never match, so only the duplicate-phrase check is kept.
         if (listOfOutput.Contains(phrase))
         {
             return;
         }
         listOfOutput.Add(phrase);
         if (!listOfOutputIndices.Contains(indices))
         {
             listOfOutputIndices.Add(indices);
             extractedPhrases.Add(extractedPh);
         }
         if (findSubTrees)
         {
             foreach (IndexedWord descendant in words)
             {
                 if (!seenNodes.Contains(descendant))
                 {
                     PrintSubGraph(g, descendant, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
                 }
             }
         }
     }
     catch (Exception e)
     {
         // Best effort: a failure on one node is reported but does not abort the caller.
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }