// line is a JSON string of a map of label to array of strings; ex: {"name":["Bush","Carter","Obama"]}
/// <summary>
/// Parses the given JSON object of label -> phrase array, registers each phrase as an
/// additional seed word for its label, and labels the already-loaded sentences
/// (Data.sents) with the new seeds.
/// </summary>
/// <param name="line">JSON object string, e.g. {"name":["Bush","Carter","Obama"]}</param>
/// <returns>"SUCCESS added new phrases" on completion</returns>
/// <exception cref="System.Exception"/>
public virtual string DoNewPhrases(string line)
{
    System.Console.Out.WriteLine("adding new phrases");
    ConstantsAndVariables constVars = new ConstantsAndVariables(props, humanLabelClasses.Keys, humanLabelClasses);
    IJsonReader jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
    IJsonObject objarr = jsonReader.ReadObject();
    // Fix: close the reader once the object is parsed. SetUpProperties already does this;
    // previously the reader was leaked here.
    jsonReader.Close();
    foreach (KeyValuePair<string, IJsonValue> o in objarr)
    {
        string label = o.Key;
        ICollection<CandidatePhrase> seed = new HashSet<CandidatePhrase>();
        IJsonArray arr = objarr.GetJsonArray(o.Key);
        for (int i = 0; i < arr.Count; i++)
        {
            string seedw = arr.GetString(i);
            System.Console.Out.WriteLine("adding " + seedw + " to seed ");
            seed.Add(CandidatePhrase.CreateOrGet(seedw));
        }
        // Remember the new seeds globally, register them with this run's variables,
        // and label the in-memory sentences with them.
        Sharpen.Collections.AddAll(seedWords[label], seed);
        constVars.AddSeedWords(label, seed);
        GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seed, constVars, false);
    }
    //model.labelWords(label, labelclass, Data.sents, seed);
    return ("SUCCESS added new phrases");
}
/// <summary>
/// Runs one full pattern-learning pass over the already-loaded sentences and
/// returns the learned words for every label as a JSON string.
/// </summary>
/// <returns>JSON representation of the learned words per label</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.MemberAccessException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
/// <exception cref="Java.Lang.InstantiationException"/>
/// <exception cref="System.MissingMethodException"/>
/// <exception cref="System.Reflection.TargetInvocationException"/>
public virtual string SuggestPhrases()
{
    // Clear any pattern labels left over from a previous run before learning again.
    ResetPatternLabelsInSents(Data.sents);
    GetPatternsFromDataMultiClass<SurfacePattern> learner =
        new GetPatternsFromDataMultiClass<SurfacePattern>(props, Data.sents, seedWords, false, humanLabelClasses);
    //learner.constVars.numIterationsForPatterns = 2;
    learner.IterateExtractApply();
    return learner.constVars.GetLearnedWordsAsJson();
}
/// <summary>
/// Recursively collects the descendants of <c>curr</c> in the semantic graph into
/// <c>descendantSet</c>, pruning at cut-off relations and tags, and records
/// dependency-based features per token index in <c>feat</c>.
/// </summary>
/// <param name="g">the dependency graph being traversed</param>
/// <param name="curr">current node; added to the set unless filtered out</param>
/// <param name="descendantSet">accumulator for accepted nodes</param>
/// <param name="allCutOffRels">relation names at which traversal is cut off</param>
/// <param name="doNotAddThese">nodes to exclude (and not descend into); may be null</param>
/// <param name="seenNodes">visited-set guarding against cycles</param>
/// <param name="ignoreCommonTags">if true, nodes whose POS tag is in ignoreTags are not added</param>
/// <param name="acceptWord">predicate a node's label must pass to be added</param>
/// <param name="feat">per-token-index feature strings, filled as a side effect</param>
/// <exception cref="System.Exception"/>
private static void DescendantsHelper(SemanticGraph g, IndexedWord curr, ICollection<IndexedWord> descendantSet, IList<string> allCutOffRels, IList<IndexedWord> doNotAddThese, IList<IndexedWord> seenNodes, bool ignoreCommonTags, IPredicate<CoreLabel> acceptWord, CollectionValuedMap<int, string> feat)
{
    // Cycle guard: each node is processed at most once.
    if (seenNodes.Contains(curr))
    {
        return;
    }
    seenNodes.Add(curr);
    // Stop if already collected, explicitly excluded, or rejected by the word filter.
    if (descendantSet.Contains(curr) || (doNotAddThese != null && doNotAddThese.Contains(curr)) || !acceptWord.Test(curr.BackingLabel()))
    {
        return;
    }
    // Add curr unless we are ignoring common tags and its tag is one of them.
    if (!ignoreCommonTags || !ignoreTags.Contains(curr.Tag().Trim()))
    {
        descendantSet.Add(curr);
    }
    foreach (IndexedWord child in g.GetChildren(curr))
    {
        bool dontuse = false;
        if (doNotAddThese != null && doNotAddThese.Contains(child))
        {
            dontuse = true;
        }
        GrammaticalRelation rel = null;
        if (dontuse == false)
        {
            // Cut off traversal along relations that match the constraints.
            rel = g.Reln(curr, child);
            dontuse = CheckIfSatisfiesRelConstrains(g, curr, child, allCutOffRels, rel);
        }
        if (dontuse == false)
        {
            // Also cut off children whose POS tag matches any configured cut-off regex.
            foreach (string cutOffTagRegex in cutoffTags)
            {
                if (child.Tag().Matches(cutOffTagRegex))
                {
                    if (Debug >= 5)
                    {
                        System.Console.Out.WriteLine("ignored tag " + child + " because it satisfied " + cutOffTagRegex);
                    }
                    dontuse = true;
                    break;
                }
            }
        }
        if (dontuse == false)
        {
            // Record dependency features for curr's token index, then recurse into the child.
            if (!feat.Contains(curr.Index()))
            {
                feat[curr.Index()] = new List<string>();
            }
            GetPatternsFromDataMultiClass.GetFeatures(g, curr, false, feat[curr.Index()], rel);
            //feat.add(curr.index(), "REL-" + rel.getShortName());
            DescendantsHelper(g, child, descendantSet, allCutOffRels, doNotAddThese, seenNodes, ignoreCommonTags, acceptWord, feat);
        }
    }
}
// the format of the line input is json string of maps. required keys are "input" and "seedWords". "input" can be a string or file (in which case readFile should be true.)
// For example: {"input":"presidents.txt","seedWords":{"name":["Obama"],"place":["Chicago"]}}
/// <summary>
/// Tokenizes/annotates the configured input text into Data.sents and labels it with the
/// current seed words for every label; optionally writes column output to outputFile.
/// </summary>
/// <param name="writeOutputToFile">if true, writes column-format output to outputFile</param>
/// <returns>"SUCCESS" on completion</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Java.Lang.InstantiationException"/>
/// <exception cref="System.Reflection.TargetInvocationException"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
/// <exception cref="Java.Sql.SQLException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="System.MemberAccessException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="System.MissingMethodException"/>
public virtual string ProcessText(bool writeOutputToFile)
{
    logger.Info("Starting to process text");
    logger.Info("all seed words are " + seedWords);
    // ProcessSents returns a pair; only the first element (the sentence map) is used here.
    Pair<IDictionary<string, DataInstance>, IDictionary<string, DataInstance>> sentsPair = GetPatternsFromDataMultiClass.ProcessSents(props, seedWords.Keys);
    Data.sents = sentsPair.First();
    // NOTE(review): constVars is built with machineAnswerClasses but the labelling below uses
    // humanLabelClasses — presumably intentional (human labels are applied to text), confirm.
    ConstantsAndVariables constVars = new ConstantsAndVariables(props, seedWords.Keys, machineAnswerClasses);
    foreach (string label in seedWords.Keys)
    {
        GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seedWords[label], constVars, true);
    }
    if (writeOutputToFile)
    {
        GetPatternsFromDataMultiClass.WriteColumnOutput(outputFile, false, humanLabelClasses);
        System.Console.Out.WriteLine("written the output to " + outputFile);
    }
    logger.Info("Finished processing text");
    return ("SUCCESS");
}
/// <summary>
/// Scores patterns for the current label: a numerator pattern weight is divided by a
/// denominator weight whose computation depends on the configured PatternScoring measure,
/// optionally multiplied by log(#phrases extracted) for the *logP variants.
/// </summary>
/// <returns>the non-zero pattern weights for this label</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public override ICounter<E> Score()
{
    // Normalized external dictionary-odds weights for this label, if configured.
    ICounter<CandidatePhrase> externalWordWeightsNormalized = null;
    if (constVars.dictOddsWeights.Contains(label))
    {
        externalWordWeightsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
    }
    // Only SqrtAllRatio weights phrases by how often the pattern extracted them.
    bool useFreqPhraseExtractedByPat = patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio);
    // NOTE(review): both scoring functions below are always null; if useFreqPhraseExtractedByPat
    // is true, Convert2OneDim dereferences scoringFunction — looks like lambda assignments were
    // lost in the Java->C# translation; confirm against the upstream Java source.
    IToDoubleFunction<Pair<E, CandidatePhrase>> numeratorScore = null;
    ICounter<E> numeratorPatWt = this.Convert2OneDim(label, numeratorScore, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, null, useFreqPhraseExtractedByPat);
    IToDoubleFunction<Pair<E, CandidatePhrase>> denoScore = null;
    ICounter<E> denominatorPatWt;
    // Flattened from the original deeply nested if/else chain; the first three measures
    // used byte-identical arguments and are merged into one branch.
    if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegUnlabOdds) ||
        patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.RatioAll) ||
        patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PosNegOdds))
    {
        denominatorPatWt = this.Convert2OneDim(label, denoScore, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, false, externalWordWeightsNormalized, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) ||
             patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP) ||
             patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) ||
             patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP))
    {
        // These measures score phrases during pattern selection (scorePhrasesInPatSelection = true).
        denominatorPatWt = this.Convert2OneDim(label, denoScore, allCandidatePhrases, patternsandWords4Label, constVars.sqrtPatScore, true, externalWordWeightsNormalized, useFreqPhraseExtractedByPat);
    }
    else if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.SqrtAllRatio))
    {
        // SqrtAllRatio forces sqrt scoring for the denominator.
        denominatorPatWt = this.Convert2OneDim(label, denoScore, allCandidatePhrases, patternsandWords4Label, true, false, externalWordWeightsNormalized, useFreqPhraseExtractedByPat);
    }
    else
    {
        throw new Exception("Cannot understand patterns scoring");
    }
    ICounter<E> currentPatternWeights4Label = Counters.DivisionNonNaN(numeratorPatWt, denominatorPatWt);
    // Multiplying by logP: weight each pattern by log of the number of phrases it extracted.
    if (patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP))
    {
        ICounter<E> logpos_i = new ClassicCounter<E>();
        foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in patternsandWords4Label.EntrySet())
        {
            logpos_i.SetCount(en.Key, Math.Log(en.Value.Size()));
        }
        Counters.MultiplyInPlace(currentPatternWeights4Label, logpos_i);
    }
    Counters.RetainNonZeros(currentPatternWeights4Label);
    return (currentPatternWeights4Label);
}
/// <summary>
/// Collapses the two-dimensional (pattern x phrase) counts into a single per-pattern score.
/// Depending on the scoring mode, each phrase may first be re-scored by phrase-evaluation
/// measures (edit distance, Google/domain ngrams, word-class distsim) or by a learned
/// logistic-regression phrase classifier; the (possibly re-weighted) phrase scores are then
/// summed (or sqrt-summed) per pattern.
/// </summary>
/// <param name="label">label being scored</param>
/// <param name="scoringFunction">per-(pattern, phrase) multiplier; only dereferenced when useFreqPhraseExtractedByPat is true</param>
/// <param name="allCandidatePhrases">phrases to pre-compute evaluation scores for</param>
/// <param name="positivePatternsAndWords">pattern -> phrase -> count matrix being collapsed</param>
/// <param name="sqrtPatScore">unused here; constVars.sqrtPatScore is consulted instead — NOTE(review): confirm this is intended</param>
/// <param name="scorePhrasesInPatSelection">whether phrase-evaluation / classifier scoring is active</param>
/// <param name="dictOddsWordWeights">external dictionary-odds weights (may be null when unused)</param>
/// <param name="useFreqPhraseExtractedByPat">if true, multiply each phrase score by scoringFunction</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual ICounter<E> Convert2OneDim(string label, IToDoubleFunction<Pair<E, CandidatePhrase>> scoringFunction, ICollection<CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter<E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter<CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
{
    // if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
    // Data.loadGoogleNGrams();
    // }
    ICounter<E> patterns = new ClassicCounter<E>();
    ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromOtherSemanticBinaryScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter<CandidatePhrase>();
    double externalWtsDefault = 0.5;
    ICounter<string> classifierScores = null;
    if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
    {
        // Phase 1 (PhEvalInPat*): pre-compute each enabled evaluation measure per candidate phrase.
        foreach (CandidatePhrase gc in allCandidatePhrases)
        {
            string g = gc.GetPhrase();
            if (constVars.usePatternEvalEditDistOther)
            {
                editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
            }
            if (constVars.usePatternEvalEditDistSame)
            {
                editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
            }
            if (constVars.usePatternEvalGoogleNgram)
            {
                googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
            }
            if (constVars.usePatternEvalDomainNgram)
            {
                // calculate domain-ngram wts
                if (Data.domainNGramRawFreq.ContainsKey(g))
                {
                    System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                    domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
                }
            }
            if (constVars.usePatternEvalWordClass)
            {
                // NOTE(review): `num == null` on a non-nullable int is always false in C# (Java
                // Integer artifact); a missing key likely throws instead of falling back — verify.
                int num = constVars.GetWordClassClusters()[g];
                if (num == null)
                {
                    num = constVars.GetWordClassClusters()[g.ToLower()];
                }
                if (num != null && constVars.distSimWeights[label].ContainsKey(num))
                {
                    externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
                }
                else
                {
                    externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
                }
            }
        }
        // Normalize each enabled measure into a comparable range.
        if (constVars.usePatternEvalGoogleNgram)
        {
            googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalDomainNgram)
        {
            domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
        }
        if (constVars.usePatternEvalWordClass)
        {
            externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
        }
    }
    else
    {
        if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
        {
            // Phase 1 (Logreg*): train/apply a phrase classifier to score all candidate phrases.
            Properties props2 = new Properties();
            props2.PutAll(props);
            props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
            ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
            System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
            ArgumentParser.FillOptions(typeof(Data), props2);
            classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
        }
    }
    // Phase 2: walk the pattern x phrase matrix, re-weight each phrase, and sum per pattern.
    ICounter<CandidatePhrase> cachedScoresForThisIter = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in positivePatternsAndWords.EntrySet())
    {
        foreach (KeyValuePair<CandidatePhrase, double> en2 in en.Value.EntrySet())
        {
            CandidatePhrase word = en2.Key;
            ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
            double score = 1;
            if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
            {
                // Phrase scores are cached per iteration; dictionary/common words keep score 1.
                if (cachedScoresForThisIter.ContainsKey(word))
                {
                    score = cachedScoresForThisIter.GetCount(word);
                }
                else
                {
                    if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
                    {
                        score = 1;
                    }
                    else
                    {
                        // Each enabled measure contributes a value in [0,1]; note most are
                        // inverted (1 - x) so that lower evidence yields a higher "badness".
                        if (constVars.usePatternEvalSemanticOdds)
                        {
                            double semanticClassOdds = 1;
                            if (dictOddsWordWeights.ContainsKey(word))
                            {
                                semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
                        }
                        if (constVars.usePatternEvalGoogleNgram)
                        {
                            double gscore = 0;
                            if (googleNgramNormScores.ContainsKey(word))
                            {
                                gscore = 1 - googleNgramNormScores.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
                        }
                        if (constVars.usePatternEvalDomainNgram)
                        {
                            double domainscore;
                            if (domainNgramNormScores.ContainsKey(word))
                            {
                                domainscore = 1 - domainNgramNormScores.GetCount(word);
                            }
                            else
                            {
                                domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
                        }
                        if (constVars.usePatternEvalWordClass)
                        {
                            double externalFeatureWt = externalWtsDefault;
                            if (externalFeatWtsNormalized.ContainsKey(word))
                            {
                                externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
                            }
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
                        }
                        if (constVars.usePatternEvalEditDistOther)
                        {
                            System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come no edit distance info for word " + word + string.Empty);
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
                        }
                        if (constVars.usePatternEvalEditDistSame)
                        {
                            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
                        }
                        // taking average
                        score = Counters.Mean(scoreslist);
                        phInPatScores.SetCounter(word, scoreslist);
                    }
                    cachedScoresForThisIter.SetCount(word, score);
                }
            }
            else
            {
                if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
                {
                    score = 1 - classifierScores.GetCount(word);
                }
            }
            // score = 1 - scorePhrases.scoreUsingClassifer(classifier,
            // e.getKey(), label, true, null, null, dictOddsWordWeights);
            // throw new RuntimeException("not implemented yet");
            if (useFreqPhraseExtractedByPat)
            {
                score = score * scoringFunction.ApplyAsDouble(new Pair<E, CandidatePhrase>(en.Key, word));
            }
            if (constVars.sqrtPatScore)
            {
                patterns.IncrementCount(en.Key, Math.Sqrt(score));
            }
            else
            {
                patterns.IncrementCount(en.Key, score);
            }
        }
    }
    return (patterns);
}
/// <summary>
/// Parses a JSON configuration string into run properties and seed words, merges any
/// additional seed-word files, resolves the input file (or writes inline input to a temp
/// file), and binds per-label machine/human annotation classes via reflection.
/// </summary>
/// <param name="line">JSON object of properties; the "seedWords" key maps label -> phrase array</param>
/// <param name="readFile">if true, "input" names a file; otherwise it is raw text written to a temp file</param>
/// <param name="writeOutputToFile">if true, sets columnOutputFile (when not already set)</param>
/// <param name="additionalSeedWordsFiles">optional extra seed-word files to merge in</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual void SetUpProperties(string line, bool readFile, bool writeOutputToFile, string additionalSeedWordsFiles)
{
    IJsonReader jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
    IJsonObject objarr = jsonReader.ReadObject();
    jsonReader.Close();
    // NOTE: this local shadows the instance field; it is assigned to this.props at the end.
    Properties props = new Properties();
    foreach (string o in objarr.Keys)
    {
        if (o.Equals("seedWords"))
        {
            // "seedWords" is a nested object: label -> array of phrase strings.
            IJsonObject obj = objarr.GetJsonObject(o);
            foreach (string st in obj.Keys)
            {
                seedWords[st] = new HashSet<CandidatePhrase>();
                IJsonArray arr = obj.GetJsonArray(st);
                for (int i = 0; i < arr.Count; i++)
                {
                    string val = arr.GetString(i);
                    seedWords[st].Add(CandidatePhrase.CreateOrGet(val));
                    System.Console.Out.WriteLine("adding " + val + " for label " + st);
                }
            }
        }
        else
        {
            // Every other top-level key is treated as a plain string property.
            props.SetProperty(o, objarr.GetString(o));
        }
    }
    System.Console.Out.WriteLine("seedwords are " + seedWords);
    if (additionalSeedWordsFiles != null && !additionalSeedWordsFiles.IsEmpty())
    {
        // Merge file-based seed words into labels that already exist from the JSON.
        IDictionary<string, ICollection<CandidatePhrase>> additionalSeedWords = GetPatternsFromDataMultiClass.ReadSeedWords(additionalSeedWordsFiles);
        logger.Info("additional seed words are " + additionalSeedWords);
        foreach (string label in seedWords.Keys)
        {
            if (additionalSeedWords.Contains(label))
            {
                Sharpen.Collections.AddAll(seedWords[label], additionalSeedWords[label]);
            }
        }
    }
    outputFile = null;
    if (readFile)
    {
        // "input" is a file path; derive the column-output file name from it.
        System.Console.Out.WriteLine("input value is " + objarr.GetString("input"));
        outputFile = props.GetProperty("input") + "_processed";
        props.SetProperty("file", objarr.GetString("input"));
        if (writeOutputToFile && !props.Contains("columnOutputFile"))
        {
            props.SetProperty("columnOutputFile", outputFile);
        }
    }
    else
    {
        // "input" is raw text: persist it to a temp file so downstream code can read "file".
        string systemdir = Runtime.GetProperty("java.io.tmpdir");
        File tempFile = File.CreateTempFile("sents", ".tmp", new File(systemdir));
        tempFile.DeleteOnExit();
        IOUtils.WriteStringToFile(props.GetProperty("input"), tempFile.GetPath(), "utf8");
        props.SetProperty("file", tempFile.GetAbsolutePath());
    }
    SetProperties(props);
    this.props = props;
    // Bind the i-th label to the i-th PatternLabel/PatternHumanLabel annotation class.
    int i_1 = 1;
    foreach (string label_1 in seedWords.Keys)
    {
        string ansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + i_1;
        Type mcCl = (Type)Sharpen.Runtime.GetType(ansclstr);
        machineAnswerClasses[label_1] = mcCl;
        string humanansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternHumanLabel" + i_1;
        humanLabelClasses[label_1] = (Type)Sharpen.Runtime.GetType(humanansclstr);
        i_1++;
    }
}
/// <summary>Returns, as a JSON string, the tokens matched for the given phrase.</summary>
/// <param name="input">the phrase to look up matched tokens for</param>
public virtual string GetMatchedTokensByPhrase(string input)
{
    // Delegate to the static JSON serializer for matched tokens of a single phrase.
    string json = GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString(input);
    return json;
}
/// <summary>Returns, as a JSON string, the matched tokens for every phrase.</summary>
public virtual string GetMatchedTokensByAllPhrases()
{
    // Delegate to the static JSON serializer (no-argument overload covers all phrases).
    string json = GetPatternsFromDataMultiClass.MatchedTokensByPhraseJsonString();
    return json;
}
/// <summary>
/// Runs the model in test mode: builds run properties from a saved model properties file
/// (stripping train-only keys), processes the text, loads saved patterns/words, optionally
/// learns further, and collects the extracted phrases for the (single) label as JSON.
/// </summary>
/// <param name="testProps">properties that override the model properties for this test run</param>
/// <param name="modelPropertiesFile">properties file the model was trained with</param>
/// <param name="stopWordsFile">file used for stop/common/English word lists</param>
/// <returns>JSON of extracted phrases (plus matched seed words) for the label</returns>
/// <exception cref="System.MemberAccessException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Java.Lang.InstantiationException"/>
/// <exception cref="System.MissingMethodException"/>
/// <exception cref="System.Reflection.TargetInvocationException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="Java.Sql.SQLException"/>
public virtual string SuggestPhrasesTest(Properties testProps, string modelPropertiesFile, string stopWordsFile)
{
    logger.Info("Suggesting phrases in test");
    logger.Info("test properties are " + testProps);
    Properties runProps = StringUtils.ArgsToPropertiesWithResolve(new string[] { "-props", modelPropertiesFile });
    // Strip training-only / environment-specific keys from the saved model properties.
    string[] removeProperties = new string[] { "allPatternsDir", "storePatsForEachToken", "invertedIndexClass", "savePatternsWordsDir", "batchProcessSents", "outDir", "saveInvertedIndex", "removeOverLappingLabels", "numThreads" };
    foreach (string s in removeProperties)
    {
        if (runProps.Contains(s))
        {
            runProps.Remove(s);
        }
    }
    runProps.SetProperty("stopWordsPatternFiles", stopWordsFile);
    runProps.SetProperty("englishWordsFiles", stopWordsFile);
    runProps.SetProperty("commonWordsPatternFiles", stopWordsFile);
    // Later PutAll calls win: instance props, then test props, override the model file.
    runProps.PutAll(props);
    runProps.PutAll(testProps);
    props.PutAll(runProps);
    ProcessText(false);
    GetPatternsFromDataMultiClass<SurfacePattern> model = new GetPatternsFromDataMultiClass<SurfacePattern>(runProps, Data.sents, seedWords, true, humanLabelClasses);
    ArgumentParser.FillOptions(model, runProps);
    GetPatternsFromDataMultiClass.LoadFromSavedPatternsWordsDir(model, runProps);
    IDictionary<string, int> alreadyLearnedIters = new Dictionary<string, int>();
    foreach (string label in model.constVars.GetLabels())
    {
        alreadyLearnedIters[label] = model.constVars.GetLearnedWordsEachIter()[label].LastEntry().Key;
    }
    if (model.constVars.learn)
    {
        // Map<String, E> p0 = new HashMap<String, SurfacePattern>();
        // Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<String, Counter<CandidatePhrase>>();
        // Map<String, Set<E>> ignorePatterns = new HashMap<String, Set<E>>();
        model.IterateExtractApply(null, null, null);
    }
    IDictionary<string, ICounter<CandidatePhrase>> allExtractions = new Dictionary<string, ICounter<CandidatePhrase>>();
    //Only for one label right now!
    // Fix: the original read GetEnumerator().Current without calling MoveNext(), which in C#
    // yields the default value (null) rather than the first element as Java's iterator().next() did.
    IEnumerator<string> labelIterator = model.constVars.GetLabels().GetEnumerator();
    labelIterator.MoveNext();
    string label_1 = labelIterator.Current;
    allExtractions[label_1] = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<string, DataInstance> sent in Data.sents)
    {
        // Accumulate consecutive pattern-matched tokens into one phrase; a non-matched token
        // terminates the current phrase.
        StringBuilder str = new StringBuilder();
        foreach (CoreLabel l in sent.Value.GetTokens())
        {
            if (l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null && !l.Get(typeof(PatternsAnnotations.MatchedPatterns)).IsEmpty())
            {
                str.Append(" " + l.Word());
            }
            else
            {
                // NOTE(review): a phrase still pending in `str` when the sentence ends is never
                // flushed — looks like a pre-existing upstream quirk; confirm before changing.
                allExtractions[label_1].IncrementCount(CandidatePhrase.CreateOrGet(str.ToString().Trim()));
                str.Length = 0;
            }
        }
    }
    allExtractions.PutAll(model.matchedSeedWords);
    return (model.constVars.GetSetWordsAsJson(allExtractions));
}
/// <summary>
/// Scores candidate phrases for a label by averaging the enabled evaluation measures:
/// TF-IDF-like pattern weight, dictionary odds, domain/Google ngram scores, distsim word
/// class, edit distances, and word shape. Already-identified words are skipped; words whose
/// average is infinite/NaN are dropped.
/// </summary>
/// <param name="label">label being scored</param>
/// <param name="terms">candidate phrase -> pattern -> count matrix</param>
/// <param name="wordsPatExtracted">unused in this method body</param>
/// <param name="allSelectedPatterns">pattern weights used for the TF-IDF-like score</param>
/// <param name="alreadyIdentifiedWords">phrases to skip</param>
/// <param name="forLearningPatterns">unused in this method body</param>
/// <returns>average score per candidate phrase</returns>
internal override ICounter<CandidatePhrase> ScorePhrases(string label, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, bool forLearningPatterns)
{
    IDictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> scores = new Dictionary<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>>();
    if (Data.domainNGramsFile != null)
    {
        Data.LoadDomainNGrams();
    }
    Redwood.Log(ConstantsAndVariables.extremedebug, "Considering terms: " + terms.FirstKeySet());
    // calculate TF-IDF like scores
    ICounter<CandidatePhrase> tfidfScores = new ClassicCounter<CandidatePhrase>();
    if (constVars.usePhraseEvalPatWtByFreq)
    {
        foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
        {
            double score = GetPatTFIDFScore(en.Key, en.Value, allSelectedPatterns);
            tfidfScores.SetCount(en.Key, score);
        }
        Redwood.Log(ConstantsAndVariables.extremedebug, "BEFORE IDF " + Counters.ToSortedString(tfidfScores, 100, "%1$s:%2$f", "\t"));
        Counters.DivideInPlace(tfidfScores, Data.processedDataFreq);
    }
    ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceOtherBinaryScores = new ClassicCounter<CandidatePhrase>();
    ICounter<CandidatePhrase> editDistanceSameBinaryScores = new ClassicCounter<CandidatePhrase>();
    // Pass 1: compute the raw value of every enabled measure per candidate phrase.
    foreach (CandidatePhrase gc in terms.FirstKeySet())
    {
        string g = gc.GetPhrase();
        // NOTE(review): the "1 -" inversions on the edit-distance measures here are the opposite
        // of those in Convert2OneDim (other-class inverted here, same-class not) — confirm intent.
        if (constVars.usePhraseEvalEditDistOther)
        {
            editDistanceOtherBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
        }
        if (constVars.usePhraseEvalEditDistSame)
        {
            editDistanceSameBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresThisClassThreshold(label, g));
        }
        if (constVars.usePhraseEvalDomainNgram)
        {
            // calculate domain-ngram wts
            if (Data.domainNGramRawFreq.ContainsKey(g))
            {
                System.Diagnostics.Debug.Assert((Data.rawFreq.ContainsKey(gc)));
                domainNgramNormScores.SetCount(gc, GetDomainNgramScore(g));
            }
            else
            {
                log.Info("why is " + g + " not present in domainNgram");
            }
        }
        if (constVars.usePhraseEvalGoogleNgram)
        {
            googleNgramNormScores.SetCount(gc, GetGoogleNgramScore(gc));
        }
        if (constVars.usePhraseEvalWordClass)
        {
            // calculate dist sim weights
            // NOTE(review): `num == null` on a non-nullable int is always false in C# (Java
            // Integer artifact); a missing key likely throws instead of falling back — verify.
            int num = constVars.GetWordClassClusters()[g];
            if (num == null)
            {
                num = constVars.GetWordClassClusters()[g.ToLower()];
            }
            if (num != null && constVars.distSimWeights[label].ContainsKey(num))
            {
                externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num));
            }
            else
            {
                externalFeatWtsNormalized.SetCount(gc, OOVExternalFeatWt);
            }
        }
    }
    // Normalize the measures into comparable ranges.
    ICounter<CandidatePhrase> normTFIDFScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(tfidfScores, true, true, false);
    ICounter<CandidatePhrase> dictOdddsScores = null;
    if (constVars.usePhraseEvalSemanticOdds)
    {
        System.Diagnostics.Debug.Assert(constVars.dictOddsWeights != null, "usePhraseEvalSemanticOdds is true but dictOddsWeights is null for the label " + label);
        dictOdddsScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(constVars.dictOddsWeights[label], true, true, false);
    }
    domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
    googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
    externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
    // Counters.max(googleNgramNormScores);
    // Counters.max(externalFeatWtsNormalized);
    // Pass 2: assemble the per-phrase measure list (OOV fallbacks where a measure is missing).
    foreach (CandidatePhrase word in terms.FirstKeySet())
    {
        if (alreadyIdentifiedWords.Contains(word))
        {
            continue;
        }
        ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
        System.Diagnostics.Debug.Assert(normTFIDFScores.ContainsKey(word), "NormTFIDF score does not contain" + word);
        double tfscore = normTFIDFScores.GetCount(word);
        scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Patwtbyfreq, tfscore);
        if (constVars.usePhraseEvalSemanticOdds)
        {
            double dscore;
            if (dictOdddsScores.ContainsKey(word))
            {
                dscore = dictOdddsScores.GetCount(word);
            }
            else
            {
                dscore = GetPhraseWeightFromWords(dictOdddsScores, word, OOVdictOdds);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, dscore);
        }
        if (constVars.usePhraseEvalDomainNgram)
        {
            double domainscore;
            if (domainNgramNormScores.ContainsKey(word))
            {
                domainscore = domainNgramNormScores.GetCount(word);
            }
            else
            {
                domainscore = GetPhraseWeightFromWords(domainNgramNormScores, word, OOVDomainNgramScore);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
        }
        if (constVars.usePhraseEvalGoogleNgram)
        {
            double googlescore;
            if (googleNgramNormScores.ContainsKey(word))
            {
                googlescore = googleNgramNormScores.GetCount(word);
            }
            else
            {
                googlescore = GetPhraseWeightFromWords(googleNgramNormScores, word, OOVGoogleNgramScore);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, googlescore);
        }
        if (constVars.usePhraseEvalWordClass)
        {
            double externalFeatureWt;
            if (externalFeatWtsNormalized.ContainsKey(word))
            {
                externalFeatureWt = externalFeatWtsNormalized.GetCount(word);
            }
            else
            {
                externalFeatureWt = GetPhraseWeightFromWords(externalFeatWtsNormalized, word, OOVExternalFeatWt);
            }
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
        }
        if (constVars.usePhraseEvalEditDistOther)
        {
            System.Diagnostics.Debug.Assert(editDistanceOtherBinaryScores.ContainsKey(word), "How come no edit distance info?");
            double editD = editDistanceOtherBinaryScores.GetCount(word);
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editD);
        }
        if (constVars.usePhraseEvalEditDistSame)
        {
            double editDSame = editDistanceSameBinaryScores.GetCount(word);
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDSame);
        }
        if (constVars.usePhraseEvalWordShape)
        {
            scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Wordshape, this.GetWordShapeScore(word.GetPhrase(), label));
        }
        scores[word] = scoreslist;
        phraseScoresNormalized.SetCounter(word, scoreslist);
    }
    // Final score is the mean of all collected measures; drop infinite/NaN averages.
    ICounter<CandidatePhrase> phraseScores = new ClassicCounter<CandidatePhrase>();
    foreach (KeyValuePair<CandidatePhrase, ICounter<ConstantsAndVariables.ScorePhraseMeasures>> wEn in scores)
    {
        double avgScore = Counters.Mean(wEn.Value);
        if (!avgScore.IsInfinite() && !double.IsNaN(avgScore))
        {
            phraseScores.SetCount(wEn.Key, avgScore);
        }
        else
        {
            Redwood.Log(Redwood.Dbg, "Ignoring " + wEn.Key + " because score is " + avgScore);
        }
    }
    return (phraseScores);
}
/// <summary>creates all patterns and saves them in the correct PatternsForEachToken* class appropriately</summary>
/// <param name="sents">sentence id -> DataInstance map to compute patterns over</param>
/// <param name="props">run properties forwarded to each worker thread</param>
/// <param name="storePatsForEachTokenWay">how/where the per-token patterns are stored</param>
public virtual void GetAllPatterns(IDictionary<string, DataInstance> sents, Properties props, ConstantsAndVariables.PatternForEachTokenWay storePatsForEachTokenWay)
{
    // this.patternsForEachToken = new HashMap<String, Map<Integer, Triple<Set<Integer>, Set<Integer>, Set<Integer>>>>();
    // this.patternsForEachToken = new HashMap<String, Map<Integer, Set<Integer>>>();
    DateTime startDate = new DateTime();
    IList<string> keyset = new List<string>(sents.Keys);
    // Each of numThreads workers gets ~num sentence ids; the last worker takes the remainder.
    int num;
    if (constVars.numThreads == 1)
    {
        num = keyset.Count;
    }
    else
    {
        num = keyset.Count / (constVars.numThreads);
    }
    IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);
    Redwood.Log(ConstantsAndVariables.extremedebug, "Computing all patterns. keyset size is " + keyset.Count + ". Assigning " + num + " values to each thread");
    IList<IFuture<bool>> list = new List<IFuture<bool>>();
    for (int i = 0; i < constVars.numThreads; i++)
    {
        int from = i * num;
        int to = -1;
        if (i == constVars.numThreads - 1)
        {
            to = keyset.Count;
        }
        else
        {
            to = Math.Min(keyset.Count, (i + 1) * num);
        }
        //
        // Redwood.log(ConstantsAndVariables.extremedebug, "assigning from " + i * num
        // + " till " + Math.min(keyset.size(), (i + 1) * num));
        IList<string> ids = keyset.SubList(from, to);
        ICallable<bool> task = new CreatePatterns.CreatePatternsThread(this, sents, ids, props, storePatsForEachTokenWay);
        IFuture<bool> submit = executor.Submit(task);
        list.Add(submit);
    }
    // Now retrieve the result
    foreach (IFuture<bool> future in list)
    {
        try
        {
            future.Get();
        }
        catch (Exception e)
        {
            //patternsForEachToken.putAll(future.get());
            // Stop the remaining workers before propagating the failure.
            executor.ShutdownNow();
            // Fix: System.Exception has no (Exception) constructor; wrap with message + inner
            // exception so the cause is preserved.
            throw new Exception(e.Message, e);
        }
    }
    executor.Shutdown();
    DateTime endDate = new DateTime();
    string timeTaken = GetPatternsFromDataMultiClass.ElapsedTime(startDate, endDate);
    Redwood.Log(Redwood.Dbg, "Done computing all patterns [" + timeTaken + "]");
}
//Here, the index (startIndex, endIndex) seems to be inclusive of the endIndex
/// <summary>
/// Extracts the phrase spanned by the descendants of <c>w</c> (recursing first into
/// conj_and siblings), truncates it to maxPhraseLength, and appends the phrase text,
/// its token indices, and an ExtractedPhrase (with collected features) to the output
/// collections. Exceptions are caught and printed, not propagated.
/// </summary>
public virtual void PrintSubGraph(SemanticGraph g, IndexedWord w, IList<string> additionalCutOffRels, IList<string> textTokens, ICollection<string> listOfOutput, ICollection<IntPair> listOfOutputIndices, IList<IndexedWord> seenNodes, IList<IndexedWord> doNotAddThese, bool findSubTrees, ICollection<ExtractedPhrase> extractedPhrases, SemgrexPattern pattern, IPredicate<CoreLabel> acceptWord)
{
    try
    {
        // Visit each node at most once; skip explicitly excluded nodes.
        if (seenNodes.Contains(w))
        {
            return;
        }
        seenNodes.Add(w);
        if (doNotAddThese.Contains(w))
        {
            return;
        }
        // Handle conjuncts ("X and Y") as separate sub-phrases first, then exclude them
        // from this node's own descendant set.
        IList<IndexedWord> andNodes = new List<IndexedWord>();
        DescendantsWithReln(g, w, "conj_and", new List<IndexedWord>(), andNodes);
        //System.out.println("and nodes are " + andNodes);
        foreach (IndexedWord w1 in andNodes)
        {
            PrintSubGraph(g, w1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
        }
        Sharpen.Collections.AddAll(doNotAddThese, andNodes);
        // Merge caller-supplied cut-off relations with the configured ones.
        IList<string> allCutOffRels = new List<string>();
        if (additionalCutOffRels != null)
        {
            Sharpen.Collections.AddAll(allCutOffRels, additionalCutOffRels);
        }
        Sharpen.Collections.AddAll(allCutOffRels, cutoffRelations);
        CollectionValuedMap<int, string> featPerToken = new CollectionValuedMap<int, string>();
        ICollection<string> feat = new List<string>();
        GetPatternsFromDataMultiClass.GetFeatures(g, w, true, feat, null);
        ICollection<IndexedWord> words = Descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
        // words.addAll(andNodes);
        // if (includeSiblings == true) {
        // for (IndexedWord ws : g.getSiblings(w)) {
        // if (additionalCutOffNodes == null
        // || !additionalCutOffNodes.contains(g.reln(g.getParent(w),
        // ws).getShortName()))
        // words.addAll(descendants(g, ws, additionalCutOffNodes, doNotAddThese));
        // }
        // }
        // if(afterand != null){
        // Set<IndexedWord> wordsAnd = descendants(g,afterand,
        // additionalCutOffNodes);
        // words.removeAll(wordsAnd);
        // printSubGraph(g,afterand, includeSiblings, additionalCutOffNodes);
        // }
        //System.out.println("words are " + words);
        if (words.Count > 0)
        {
            // The phrase span is the min..max token index over all collected descendants
            // (1-based indices; converted to 0-based for IntPair/SubList below).
            int min = int.MaxValue;
            int max = -1;
            foreach (IndexedWord word in words)
            {
                if (word.Index() < min)
                {
                    min = word.Index();
                }
                if (word.Index() > max)
                {
                    max = word.Index();
                }
            }
            IntPair indices;
            // Map<Integer, String> ph = new TreeMap<Integer, String>();
            // String phrase = "";
            // for (IndexedWord word : words) {
            // ph.put(word.index(), word.value());
            // }
            // phrase = StringUtils.join(ph.values(), " ");
            // Truncate overly long spans to maxPhraseLength tokens.
            if ((max - min + 1) > maxPhraseLength)
            {
                max = min + maxPhraseLength - 1;
            }
            indices = new IntPair(min - 1, max - 1);
            string phrase = StringUtils.Join(textTokens.SubList(min - 1, max), " ");
            phrase = phrase.Trim();
            feat.Add("LENGTH-" + (max - min + 1));
            for (int i = min; i <= max; i++)
            {
                Sharpen.Collections.AddAll(feat, featPerToken[i]);
            }
            //System.out.println("phrase is " + phrase + " index is " + indices + " and maxphraselength is " + maxPhraseLength + " and descendentset is " + words);
            ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.AsCounter(feat));
            // NOTE(review): doNotAddThese holds IndexedWord but is queried with a string phrase
            // here — in the original Java this Contains was always false; verify intent.
            if (!listOfOutput.Contains(phrase) && !doNotAddThese.Contains(phrase))
            {
                // if (sentElem != null) {
                // Element node = new Element(elemString, curNS);
                // node.addContent(phrase);
                // sentElem.addContent(node);
                // }
                listOfOutput.Add(phrase);
                if (!listOfOutputIndices.Contains(indices))
                {
                    listOfOutputIndices.Add(indices);
                    extractedPhrases.Add(extractedPh);
                }
                // Optionally recurse into every descendant to emit nested sub-phrases too.
                if (findSubTrees == true)
                {
                    foreach (IndexedWord word_1 in words)
                    {
                        if (!seenNodes.Contains(word_1))
                        {
                            PrintSubGraph(g, word_1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
                        }
                    }
                }
            }
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}