// line is a JSON string of a map from label to an array of strings; e.g., {"name":["Bush","Carter","Obama"]}
/// <exception cref="System.Exception"/>
public virtual string DoNewPhrases(string line)
{
  System.Console.Out.WriteLine("adding new phrases");
  ConstantsAndVariables constVars = new ConstantsAndVariables(props, humanLabelClasses.Keys, humanLabelClasses);
  IJsonReader jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
  IJsonObject objarr = jsonReader.ReadObject();
  foreach (KeyValuePair<string, IJsonValue> o in objarr)
  {
    string label = o.Key;
    ICollection<CandidatePhrase> seed = new HashSet<CandidatePhrase>();
    IJsonArray arr = objarr.GetJsonArray(o.Key);
    for (int i = 0; i < arr.Count; i++)
    {
      string seedw = arr.GetString(i);
      System.Console.Out.WriteLine("adding " + seedw + " to seed ");
      seed.Add(CandidatePhrase.CreateOrGet(seedw));
    }
    Sharpen.Collections.AddAll(seedWords[label], seed);
    constVars.AddSeedWords(label, seed);
    GetPatternsFromDataMultiClass.RunLabelSeedWords(Data.sents, humanLabelClasses[label], label, seed, constVars, false);
  }
  //model.labelWords(label, labelclass, Data.sents, seed);
  return "SUCCESS added new phrases";
}
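// Illustrative sketch (not in the original source): how DoNewPhrases expects its input,
// using the example JSON from the comment above.
private string ExampleDoNewPhrases()
{
  string line = "{\"name\":[\"Bush\",\"Carter\",\"Obama\"]}";
  return this.DoNewPhrases(line); // returns "SUCCESS added new phrases" on success
}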
/*
 * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq, TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
 *     CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
 *   Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
 *   Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
 *   Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
 *   List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");
 *
 *   for(Entry<Integer, Double> en: patterns.entrySet()){
 *     Integer pindex = en.getKey();
 *     SurfacePattern p = constVars.getPatternIndex().get(pindex);
 *     String[] n = p.getSimplerTokensNext();
 *     String[] pr = p.getSimplerTokensPrev();
 *     boolean rest = false;
 *     if(n!=null){
 *       for(String e: n){
 *         if(!specialWords.contains(e)){
 *           rest = true;
 *           break;
 *         }
 *       }
 *     }
 *     if(rest == false && pr!=null){
 *       for(String e: pr){
 *         if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
 *           rest = true;
 *           break;
 *         }
 *       }
 *     }
 *     if(rest)
 *       patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
 *     else
 *       patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
 *   }
 *
 *   Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());
 *
 *   if (constVars.batchProcessSents) {
 *     List<File> filesToLoad;
 *     if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
 *       filesToLoad = Data.sentsFiles;
 *     else{
 *       filesToLoad = new ArrayList<File>();
 *       for (String fname : sentidswithfilerest.keySet()) {
 *         String filename;
 *         // if(!constVars.usingDirForSentsInIndex)
 *         //   filename = constVars.saveSentencesSerDir+"/"+fname;
 *         // else
 *         filename = fname;
 *         filesToLoad.add(new File(filename));
 *       }
 *     }
 *
 *     for (File fname : filesToLoad) {
 *       Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
 *       Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname);
 *
 *       if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){
 *         String filename;
 *         // if(constVars.usingDirForSentsInIndex)
 *         //   filename = constVars.saveSentencesSerDir+"/"+fname.getName();
 *         // else
 *         filename = fname.getAbsolutePath();
 *
 *         Set<String> sentIDs = sentidswithfilerest.get(filename);
 *         if (sentIDs != null){
 *           this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
 *         } else
 *           Redwood.log(Redwood.DBG, "No sentIds for " + filename + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
 *       }
 *       if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
 *         this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
 *       }
 *
 *       if (computeDataFreq){
 *         Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
 *         Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
 *       }
 *     }
 *
 *     // Compute frequency from the files not loaded using the inverted-index query; otherwise, later on there is an error.
 *     if(computeDataFreq){
 *       for(File f: Data.sentsFiles){
 *         if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
 *           Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
 *           Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
 *           Data.fileNamesUsedToComputeRawFreq.add(f.getName());
 *         }
 *       }
 *     }
 *
 *   } else {
 *     if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
 *       String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
 *       Set<String> sentids = sentidswithfilerest.get(filename);
 *       if (sentids != null) {
 *         this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
 *       } else
 *         throw new RuntimeException("How come no sentIds for " + filename + ". Index keyset is " + constVars.invertedIndex.getKeySet());
 *     }
 *     if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
 *       this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
 *     }
 *     Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
 *   }
 *   Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
 * }
 */
private void StatsWithoutApplyingPatterns(IDictionary<string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted)
{
  foreach (KeyValuePair<string, DataInstance> sentEn in sents)
  {
    IDictionary<int, ICollection<E>> pat4Sent = patternsForEachToken.GetPatternsForAllTokens(sentEn.Key);
    if (pat4Sent == null)
    {
      throw new Exception("How come there are no patterns for " + sentEn.Key);
    }
    foreach (KeyValuePair<int, ICollection<E>> en in pat4Sent)
    {
      CoreLabel token = null;
      ICollection<E> p1 = en.Value;
      // Set<Integer> p1 = en.getValue().first();
      // Set<Integer> p2 = en.getValue().second();
      // Set<Integer> p3 = en.getValue().third();
      foreach (E index in patternsLearnedThisIter.KeySet())
      {
        if (p1.Contains(index))
        {
          if (token == null)
          {
            token = sentEn.Value.GetTokens()[en.Key];
          }
          wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(token.Word(), token.Lemma()), index);
        }
      }
    }
  }
}
public static void ComputeRawFreqIfNull(IDictionary<string, DataInstance> sents, int numWordsCompound)
{
  Redwood.Log(Redwood.Dbg, "Computing raw freq for every 1-" + numWordsCompound + " consecutive words");
  foreach (DataInstance l in sents.Values)
  {
    IList<IList<CoreLabel>> ngrams = CollectionUtils.GetNGrams(l.GetTokens(), 1, numWordsCompound);
    foreach (IList<CoreLabel> n in ngrams)
    {
      string s = string.Empty;
      foreach (CoreLabel c in n)
      {
        // if (useWord(c, commonEngWords, ignoreWordRegex)) {
        s += " " + c.Word();
      }
      // }
      s = s.Trim();
      if (!s.IsEmpty())
      {
        Data.rawFreq.IncrementCount(CandidatePhrase.CreateOrGet(s));
      }
    }
  }
  //if (googleNGram != null && googleNGram.size() > 0)
  if (usingGoogleNgram)
  {
    SetRatioGoogleNgramFreqWithDataFreq();
  }
  if (domainNGramRawFreq != null && domainNGramRawFreq.Size() > 0)
  {
    ratioDomainNgramFreqWithDataFreq = domainNGramRawFreq.TotalCount() / Data.rawFreq.TotalCount();
  }
}
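// Illustrative sketch (not in the original source): GetNGrams above enumerates every
// window of 1..numWordsCompound consecutive tokens. For the tokens below and
// numWordsCompound = 2 it would yield "I", "like", "tea", "I like", and "like tea".
private static void ExampleNGramWindows()
{
  string[] words = new string[] { "I", "like", "tea" };
  for (int size = 1; size <= 2; size++)
  {
    for (int start = 0; start + size <= words.Length; start++)
    {
      string s = string.Empty;
      for (int k = start; k < start + size; k++)
      {
        s += " " + words[k];
      }
      // each trimmed window is what CandidatePhrase.CreateOrGet receives above
      System.Console.Out.WriteLine(s.Trim());
    }
  }
}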
internal virtual double GetPatTFIDFScore(CandidatePhrase word, ICounter<E> patsThatExtractedThis, ICounter<E> allSelectedPatterns)
{
  if (Data.processedDataFreq.GetCount(word) == 0.0)
  {
    Redwood.Log(Redwood.Warn, "How come the processed corpus freq has a count of 0 for " + word + ". The count in raw freq is " + Data.rawFreq.GetCount(word) + " and the Data.rawFreq size is " + Data.rawFreq.Size());
    return 0;
  }
  else
  {
    double total = 0;
    ICollection<E> rem = new HashSet<E>();
    foreach (KeyValuePair<E, double> en2 in patsThatExtractedThis.EntrySet())
    {
      double weight = 1.0;
      if (usePatternWeights)
      {
        weight = allSelectedPatterns.GetCount(en2.Key);
        if (weight == 0)
        {
          Redwood.Log(Redwood.Force, "Warning: weight is zero for " + en2.Key + ". Maybe the pattern was removed when choosing other patterns (if subsumed by another pattern).");
          rem.Add(en2.Key);
        }
      }
      total += weight;
    }
    Counters.RemoveKeys(patsThatExtractedThis, rem);
    double score = total / Data.processedDataFreq.GetCount(word);
    return score;
  }
}
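// Worked example (illustrative numbers, not from the source): if two patterns extracted
// a phrase with selected-pattern weights 0.8 and 0.4, and the phrase's processed corpus
// frequency is 3.0, the returned score is (0.8 + 0.4) / 3.0 = 0.4, that is, total
// pattern support divided by corpus frequency, a tf-idf-like ratio.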
public virtual ICounter<CandidatePhrase> ChooseTopWords(ICounter<CandidatePhrase> newdt, TwoDimensionalCounter<CandidatePhrase, E> terms, ICounter<CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection<CandidatePhrase> ignoreWords, double thresholdWordExtract)
{
  IEnumerator<CandidatePhrase> termIter = Counters.ToPriorityQueue(newdt).GetEnumerator();
  ICounter<CandidatePhrase> finalwords = new ClassicCounter<CandidatePhrase>();
  while (termIter.MoveNext())
  {
    if (finalwords.Size() >= constVars.numWordsToAdd)
    {
      break;
    }
    CandidatePhrase w = termIter.Current;
    if (newdt.GetCount(w) < thresholdWordExtract)
    {
      Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of " + thresholdWordExtract);
      break;
    }
    System.Diagnostics.Debug.Assert(newdt.GetCount(w) != double.PositiveInfinity);
    if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
    {
      Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non-redundant patterns is below the threshold of " + constVars.thresholdNumPatternsApplied + ":" + terms.GetCounter(w).KeySet());
      continue;
    }
    CandidatePhrase matchedFuzzy = null;
    if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
    {
      matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
    }
    if (matchedFuzzy == null)
    {
      Redwood.Log("extremePatDebug", "adding word " + w);
      finalwords.SetCount(w, newdt.GetCount(w));
    }
    else
    {
      Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in the common English words");
      ignoreWords.Add(w);
    }
  }
  string nextTen = string.Empty;
  int n = 0;
  while (termIter.MoveNext())
  {
    n++;
    if (n > 10)
    {
      break;
    }
    CandidatePhrase w = termIter.Current;
    nextTen += ";\t" + w + ":" + newdt.GetCount(w);
  }
  Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
  return finalwords;
}
public virtual double GetDictOddsScore(CandidatePhrase word, string label, double defaultWt)
{
  double dscore;
  ICounter<CandidatePhrase> dictOddsWordWeights = constVars.dictOddsWeights[label];
  System.Diagnostics.Debug.Assert(dictOddsWordWeights != null, "dictOddsWordWeights is null for label " + label);
  if (dictOddsWordWeights.ContainsKey(word))
  {
    dscore = dictOddsWordWeights.GetCount(word);
  }
  else
  {
    dscore = GetPhraseWeightFromWords(dictOddsWordWeights, word, defaultWt);
  }
  return dscore;
}
public static double GetGoogleNgramScore(CandidatePhrase g)
{
  double count = GoogleNGramsSQLBacked.GetCount(g.GetPhrase().ToLower()) + GoogleNGramsSQLBacked.GetCount(g.GetPhrase());
  if (count != -1)
  {
    if (!Data.rawFreq.ContainsKey(g))
    {
      // returning 1 because usually the lower this tf-idf-style score, the better;
      // if we don't have raw frequency info, give the phrase a bad score
      return 1;
    }
    else
    {
      return (1 + Data.rawFreq.GetCount(g) * Math.Sqrt(Data.ratioGoogleNgramFreqWithDataFreq)) / count;
    }
  }
  return 0;
}
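// Worked example (illustrative numbers): for a phrase with raw data frequency 5,
// Google n-gram count 1,000,000, and ratioGoogleNgramFreqWithDataFreq = 4.0, the
// score is (1 + 5 * sqrt(4.0)) / 1000000 = 1.1e-5; under the lower-is-better
// convention noted above, phrases that are frequent on the web score well.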
public virtual double GetPhraseWeightFromWords(ICounter<CandidatePhrase> weights, CandidatePhrase ph, double defaultWt)
{
  string[] t = ph.GetPhrase().Split("\\s+");
  if (t.Length < 2)
  {
    if (weights.ContainsKey(ph))
    {
      return weights.GetCount(ph);
    }
    else
    {
      return defaultWt;
    }
  }
  double totalscore = 0;
  double minScore = double.MaxValue;
  foreach (string w in t)
  {
    double score = defaultWt;
    if (weights.ContainsKey(CandidatePhrase.CreateOrGet(w)))
    {
      score = weights.GetCount(CandidatePhrase.CreateOrGet(w));
    }
    if (score < minScore)
    {
      minScore = score;
    }
    totalscore += score;
  }
  if (useAvgInsteadofMinPhraseScoring)
  {
    // average of the per-word scores over the number of words in the phrase
    return totalscore / t.Length;
  }
  else
  {
    return minScore;
  }
}
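// Illustrative sketch (not in the original source; the weights are made up): shows the
// min vs. average behavior of GetPhraseWeightFromWords for a two-word phrase.
private double ExamplePhraseWeight()
{
  ICounter<CandidatePhrase> weights = new ClassicCounter<CandidatePhrase>();
  weights.SetCount(CandidatePhrase.CreateOrGet("barack"), 0.9);
  weights.SetCount(CandidatePhrase.CreateOrGet("obama"), 0.5);
  // min scoring returns 0.5; average scoring returns (0.9 + 0.5) / 2 = 0.7;
  // a word missing from `weights` would contribute the default weight 0.1 instead
  return GetPhraseWeightFromWords(weights, CandidatePhrase.CreateOrGet("barack obama"), 0.1);
}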
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private ICounter<CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICounter<CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection<CandidatePhrase> ignoreWords, bool computeProcDataFreq)
{
  ICollection<CandidatePhrase> alreadyLabeledWords = new HashSet<CandidatePhrase>();
  if (constVars.doNotApplyPatterns)
  {
    // if we want to get the stats by the lossy way of just counting, without applying the patterns
    ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
    while (sentsIter.MoveNext())
    {
      Pair<IDictionary<string, DataInstance>, File> sentsf = sentsIter.Current;
      this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
    }
  }
  else if (patternsLearnedThisIter.Size() > 0)
  {
    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
  }
  if (computeProcDataFreq)
  {
    if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
    {
      Redwood.Log(Redwood.Dbg, "computing processed freq");
      foreach (KeyValuePair<CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
      {
        double @in = fq.Value;
        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
        {
          @in = Math.Sqrt(@in);
        }
        else if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
        {
          @in = 1 + Math.Log(@in);
        }
        else
        {
          throw new Exception("can't understand the normalization");
        }
        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq NaN when raw freq is " + @in);
        Data.processedDataFreq.SetCount(fq.Key, @in);
      }
    }
    else
    {
      Data.processedDataFreq = Data.rawFreq;
    }
  }
  if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
  {
    foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
    {
      if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
      {
        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
      }
    }
    RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
    ICounter<CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
    System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
    ICollection<CandidatePhrase> ignoreWordsAll;
    if (ignoreWords != null && !ignoreWords.IsEmpty())
    {
      ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
    }
    else
    {
      ignoreWordsAll = new HashSet<CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
    }
    Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
    Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
    System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
    ICounter<CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
    phraseScorer.PrintReasonForChoosing(finalwords);
    scoreForAllWordsThisIteration.Clear();
    Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
    Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
    if (constVars.goldEntities != null)
    {
      IDictionary<string, bool> goldEntities4Label = constVars.goldEntities[label];
      if (goldEntities4Label != null)
      {
        StringBuilder s = new StringBuilder();
        // Summarize gold labels for the selected words (reconstructed logic: the
        // separator and the ":UNKNOWN" marker for unlabeled words are assumptions).
        foreach (CandidatePhrase w in finalwords.KeySet())
        {
          s.Append(w.GetPhrase() + (goldEntities4Label.ContainsKey(w.GetPhrase()) ? ":" + goldEntities4Label[w.GetPhrase()] : ":UNKNOWN") + "\t");
        }
        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
      }
      else
      {
        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
      }
    }
    if (constVars.outDir != null && !constVars.outDir.IsEmpty())
    {
      string outputdir = constVars.outDir + "/" + identifier + "/" + label;
      IOUtils.EnsureDir(new File(outputdir));
      TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<CandidatePhrase, CandidatePhrase>();
      foreach (CandidatePhrase word in finalwords.KeySet())
      {
        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
        {
          foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
          {
            reasonForWords.IncrementCount(word, w2);
          }
        }
      }
      Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
      string filename = outputdir + "/words.json";
      // the json object is an array, one element per iteration, of lists of objects,
      // each of which is a bean of entity and reasons
      IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
      if (writtenInJustification.Contains(label) && writtenInJustification[label])
      {
        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
        IJsonArray objarr = jsonReader.ReadArray();
        foreach (IJsonValue o in objarr)
        {
          obj.Add(o);
        }
        jsonReader.Close();
      }
      IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
      foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
      {
        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
        IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder();
        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
        {
          l.Add(w2.GetPhrase());
        }
        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
        foreach (E p in wordsPatExtracted.GetCounter(w))
        {
          pats.Add(p.ToStringSimple());
        }
        objinner.Add("reasonwords", l);
        objinner.Add("patterns", pats);
        objinner.Add("score", finalwords.GetCount(w));
        objinner.Add("entity", w.GetPhrase());
        objThisIter.Add(objinner.Build());
      }
      obj.Add(objThisIter);
      // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, "Writing justification at " + filename);
      IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
      writtenInJustification[label] = true;
    }
    if (constVars.justify)
    {
      Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
      foreach (CandidatePhrase word in finalwords.KeySet())
      {
        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
      }
    }
    // if (usePatternResultAsLabel)
    //   if (answerLabel != null)
    //     labelWords(sents, commonEngWords, finalwords.keySet(), patterns.keySet(), outFile);
    //   else
    //     throw new RuntimeException("why is the answer label null?");
    return finalwords;
  }
  else if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
  {
    Counters.AddInPlace(terms, wordsPatExtracted);
    ICounter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<CandidatePhrase>();
    IDictionary<CandidatePhrase, E> wordMaxPat = new Dictionary<CandidatePhrase, E>();
    foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet())
    {
      ICounter<E> weights = new ClassicCounter<E>();
      foreach (E k in en.Value.KeySet())
      {
        weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
      }
      maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
      wordMaxPat[en.Key] = Counters.Argmax(weights);
    }
    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
    double maxvalue = Counters.Max(maxPatWeightTerms);
    ICollection<CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
    CandidatePhrase bestw = null;
    if (words.Count > 1)
    {
      double max = double.NegativeInfinity;
      foreach (CandidatePhrase w in words)
      {
        if (terms.GetCount(w, wordMaxPat[w]) > max)
        {
          max = terms.GetCount(w, wordMaxPat[w]);
          bestw = w;
        }
      }
    }
    else if (words.Count == 1)
    {
      // advance the enumerator before reading Current; reading Current on a fresh
      // enumerator is undefined in C#
      IEnumerator<CandidatePhrase> wordsIter = words.GetEnumerator();
      wordsIter.MoveNext();
      bestw = wordsIter.Current;
    }
    else
    {
      return new ClassicCounter<CandidatePhrase>();
    }
    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
    return Counters.AsCounter(Arrays.AsList(bestw));
  }
  else
  {
    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
  }
}
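// Worked example for the frequency normalization above (illustrative numbers): for a
// raw frequency of 100, Sqrt normalization stores sqrt(100) = 10 and Log stores
// 1 + ln(100) ~= 5.61 in Data.processedDataFreq; with Normalization.None,
// processedDataFreq simply aliases rawFreq.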
private double NumNonRedundantPatterns(TwoDimensionalCounter<CandidatePhrase, E> terms, CandidatePhrase w)
{
  object[] pats = Sharpen.Collections.ToArray(terms.GetCounter(w).KeySet());
  int numPat = 0;
  for (int i = 0; i < pats.Length; i++)
  {
    // string pati = constVars.getPatternIndex().get(pats[i]).toString();
    string pati = pats[i].ToString();
    bool contains = false;
    for (int j = i + 1; j < pats.Length; j++)
    {
      // string patj = constVars.getPatternIndex().get(pats[j]).toString();
      string patj = pats[j].ToString();
      if (patj.Contains(pati) || pati.Contains(patj))
      {
        contains = true;
        break;
      }
    }
    if (!contains)
    {
      numPat++;
    }
  }
  return numPat;
}
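// Illustrative example: if the string forms of the patterns for w are "was born in",
// "born in", and "works at", then "was born in" contains "born in", so it is treated
// as redundant and skipped; "born in" and "works at" survive and the method returns 2.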
public static bool DoNotUse(string word, ICollection<CandidatePhrase> stopWords)
{
  return stopWords.Contains(CandidatePhrase.CreateOrGet(word.ToLower())) || ignoreWordRegex.Matcher(word).Matches();
}
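// Illustrative sketch (not in the original source): the stop-word lookup lower-cases
// the word first, so capitalization does not matter.
private static bool ExampleDoNotUse()
{
  ICollection<CandidatePhrase> stop = new HashSet<CandidatePhrase>();
  stop.Add(CandidatePhrase.CreateOrGet("the"));
  return DoNotUse("The", stop); // true: "the" is in the stop-word set
}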
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual ICounter<E> Convert2OneDim(string label, IToDoubleFunction<Pair<E, CandidatePhrase>> scoringFunction, ICollection<CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter<E, CandidatePhrase> positivePatternsAndWords, bool sqrtPatScore, bool scorePhrasesInPatSelection, ICounter<CandidatePhrase> dictOddsWordWeights, bool useFreqPhraseExtractedByPat)
{
  // if (Data.googleNGram.size() == 0 && Data.googleNGramsFile != null) {
  //   Data.loadGoogleNGrams();
  // }
  ICounter<E> patterns = new ClassicCounter<E>();
  ICounter<CandidatePhrase> googleNgramNormScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> domainNgramNormScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> externalFeatWtsNormalized = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> editDistanceFromOtherSemanticBinaryScores = new ClassicCounter<CandidatePhrase>();
  ICounter<CandidatePhrase> editDistanceFromAlreadyExtractedBinaryScores = new ClassicCounter<CandidatePhrase>();
  double externalWtsDefault = 0.5;
  ICounter<CandidatePhrase> classifierScores = null;
  if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
  {
    foreach (CandidatePhrase gc in allCandidatePhrases)
    {
      string g = gc.GetPhrase();
      if (constVars.usePatternEvalEditDistOther)
      {
        editDistanceFromOtherSemanticBinaryScores.SetCount(gc, constVars.GetEditDistanceScoresOtherClassThreshold(label, g));
      }
      if (constVars.usePatternEvalEditDistSame)
      {
        editDistanceFromAlreadyExtractedBinaryScores.SetCount(gc, 1 - constVars.GetEditDistanceScoresThisClassThreshold(label, g));
      }
      if (constVars.usePatternEvalGoogleNgram)
      {
        googleNgramNormScores.SetCount(gc, PhraseScorer.GetGoogleNgramScore(gc));
      }
      if (constVars.usePatternEvalDomainNgram)
      {
        // calculate domain-ngram weights
        if (Data.domainNGramRawFreq.ContainsKey(g))
        {
          System.Diagnostics.Debug.Assert(Data.rawFreq.ContainsKey(gc));
          domainNgramNormScores.SetCount(gc, scorePhrases.phraseScorer.GetDomainNgramScore(g));
        }
      }
      if (constVars.usePatternEvalWordClass)
      {
        // cluster ids can be absent, so use a nullable int and check both the
        // original and the lower-cased phrase
        int? num = null;
        if (constVars.GetWordClassClusters().ContainsKey(g))
        {
          num = constVars.GetWordClassClusters()[g];
        }
        else if (constVars.GetWordClassClusters().ContainsKey(g.ToLower()))
        {
          num = constVars.GetWordClassClusters()[g.ToLower()];
        }
        if (num != null && constVars.distSimWeights[label].ContainsKey(num.Value))
        {
          externalFeatWtsNormalized.SetCount(gc, constVars.distSimWeights[label].GetCount(num.Value));
        }
        else
        {
          externalFeatWtsNormalized.SetCount(gc, externalWtsDefault);
        }
      }
    }
    if (constVars.usePatternEvalGoogleNgram)
    {
      googleNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(googleNgramNormScores, true, true, false);
    }
    if (constVars.usePatternEvalDomainNgram)
    {
      domainNgramNormScores = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(domainNgramNormScores, true, true, false);
    }
    if (constVars.usePatternEvalWordClass)
    {
      externalFeatWtsNormalized = GetPatternsFromDataMultiClass.NormalizeSoftMaxMinMaxScores(externalFeatWtsNormalized, true, true, false);
    }
  }
  else if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
  {
    Properties props2 = new Properties();
    props2.PutAll(props);
    props2.SetProperty("phraseScorerClass", "edu.stanford.nlp.patterns.ScorePhrasesLearnFeatWt");
    ScorePhrases scoreclassifier = new ScorePhrases(props2, constVars);
    System.Console.Out.WriteLine("file is " + props.GetProperty("domainNGramsFile"));
    ArgumentParser.FillOptions(typeof(Data), props2);
    classifierScores = scoreclassifier.phraseScorer.ScorePhrases(label, allCandidatePhrases, true);
  }
  ICounter<CandidatePhrase> cachedScoresForThisIter = new ClassicCounter<CandidatePhrase>();
  foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in positivePatternsAndWords.EntrySet())
  {
    foreach (KeyValuePair<CandidatePhrase, double> en2 in en.Value.EntrySet())
    {
      CandidatePhrase word = en2.Key;
      ICounter<ConstantsAndVariables.ScorePhraseMeasures> scoreslist = new ClassicCounter<ConstantsAndVariables.ScorePhraseMeasures>();
      double score = 1;
      if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPat) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.PhEvalInPatLogP)) && scorePhrasesInPatSelection)
      {
        if (cachedScoresForThisIter.ContainsKey(word))
        {
          score = cachedScoresForThisIter.GetCount(word);
        }
        else
        {
          if (constVars.GetOtherSemanticClassesWords().Contains(word) || constVars.GetCommonEngWords().Contains(word))
          {
            score = 1;
          }
          else
          {
            if (constVars.usePatternEvalSemanticOdds)
            {
              double semanticClassOdds = 1;
              if (dictOddsWordWeights.ContainsKey(word))
              {
                semanticClassOdds = 1 - dictOddsWordWeights.GetCount(word);
              }
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Semanticodds, semanticClassOdds);
            }
            if (constVars.usePatternEvalGoogleNgram)
            {
              double gscore = 0;
              if (googleNgramNormScores.ContainsKey(word))
              {
                gscore = 1 - googleNgramNormScores.GetCount(word);
              }
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Googlengram, gscore);
            }
            if (constVars.usePatternEvalDomainNgram)
            {
              double domainscore;
              if (domainNgramNormScores.ContainsKey(word))
              {
                domainscore = 1 - domainNgramNormScores.GetCount(word);
              }
              else
              {
                domainscore = 1 - scorePhrases.phraseScorer.GetPhraseWeightFromWords(domainNgramNormScores, word, scorePhrases.phraseScorer.OOVDomainNgramScore);
              }
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Domainngram, domainscore);
            }
            if (constVars.usePatternEvalWordClass)
            {
              double externalFeatureWt = externalWtsDefault;
              if (externalFeatWtsNormalized.ContainsKey(word))
              {
                externalFeatureWt = 1 - externalFeatWtsNormalized.GetCount(word);
              }
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Distsim, externalFeatureWt);
            }
            if (constVars.usePatternEvalEditDistOther)
            {
              System.Diagnostics.Debug.Assert(editDistanceFromOtherSemanticBinaryScores.ContainsKey(word), "How come there is no edit distance info for word " + word);
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistother, editDistanceFromOtherSemanticBinaryScores.GetCount(word));
            }
            if (constVars.usePatternEvalEditDistSame)
            {
              scoreslist.SetCount(ConstantsAndVariables.ScorePhraseMeasures.Editdistsame, editDistanceFromAlreadyExtractedBinaryScores.GetCount(word));
            }
            // taking the average of the individual measures
            score = Counters.Mean(scoreslist);
            phInPatScores.SetCounter(word, scoreslist);
          }
          cachedScoresForThisIter.SetCount(word, score);
        }
      }
      else if ((patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.Logreg) || patternScoring.Equals(GetPatternsFromDataMultiClass.PatternScoring.LOGREGlogP)) && scorePhrasesInPatSelection)
      {
        score = 1 - classifierScores.GetCount(word);
      }
      // score = 1 - scorePhrases.scoreUsingClassifer(classifier, e.getKey(), label, true, null, null, dictOddsWordWeights);
      // throw new RuntimeException("not implemented yet");
      if (useFreqPhraseExtractedByPat)
      {
        score = score * scoringFunction.ApplyAsDouble(new Pair<E, CandidatePhrase>(en.Key, word));
      }
      if (constVars.sqrtPatScore)
      {
        patterns.IncrementCount(en.Key, Math.Sqrt(score));
      }
      else
      {
        patterns.IncrementCount(en.Key, score);
      }
    }
  }
  return patterns;
}
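// Worked example for the PhEvalInPat branch above (illustrative numbers): if the
// enabled measures for a word come out as Semanticodds = 0.7, Googlengram = 0.9, and
// Distsim = 0.5, the word's score is their mean, (0.7 + 0.9 + 0.5) / 3 = 0.7, which is
// then accumulated into the extracting pattern's total (square-rooted if sqrtPatScore is set).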
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
internal virtual void SetUpProperties(string line, bool readFile, bool writeOutputToFile, string additionalSeedWordsFiles)
{
  IJsonReader jsonReader = Javax.Json.Json.CreateReader(new StringReader(line));
  IJsonObject objarr = jsonReader.ReadObject();
  jsonReader.Close();
  Properties props = new Properties();
  foreach (string o in objarr.Keys)
  {
    if (o.Equals("seedWords"))
    {
      IJsonObject obj = objarr.GetJsonObject(o);
      foreach (string st in obj.Keys)
      {
        seedWords[st] = new HashSet<CandidatePhrase>();
        IJsonArray arr = obj.GetJsonArray(st);
        for (int i = 0; i < arr.Count; i++)
        {
          string val = arr.GetString(i);
          seedWords[st].Add(CandidatePhrase.CreateOrGet(val));
          System.Console.Out.WriteLine("adding " + val + " for label " + st);
        }
      }
    }
    else
    {
      props.SetProperty(o, objarr.GetString(o));
    }
  }
  System.Console.Out.WriteLine("seedwords are " + seedWords);
  if (additionalSeedWordsFiles != null && !additionalSeedWordsFiles.IsEmpty())
  {
    IDictionary<string, ICollection<CandidatePhrase>> additionalSeedWords = GetPatternsFromDataMultiClass.ReadSeedWords(additionalSeedWordsFiles);
    logger.Info("additional seed words are " + additionalSeedWords);
    foreach (string label in seedWords.Keys)
    {
      if (additionalSeedWords.Contains(label))
      {
        Sharpen.Collections.AddAll(seedWords[label], additionalSeedWords[label]);
      }
    }
  }
  outputFile = null;
  if (readFile)
  {
    System.Console.Out.WriteLine("input value is " + objarr.GetString("input"));
    outputFile = props.GetProperty("input") + "_processed";
    props.SetProperty("file", objarr.GetString("input"));
    if (writeOutputToFile && !props.Contains("columnOutputFile"))
    {
      props.SetProperty("columnOutputFile", outputFile);
    }
  }
  else
  {
    string systemdir = Runtime.GetProperty("java.io.tmpdir");
    File tempFile = File.CreateTempFile("sents", ".tmp", new File(systemdir));
    tempFile.DeleteOnExit();
    IOUtils.WriteStringToFile(props.GetProperty("input"), tempFile.GetPath(), "utf8");
    props.SetProperty("file", tempFile.GetAbsolutePath());
  }
  SetProperties(props);
  this.props = props;
  int i_1 = 1;
  foreach (string label_1 in seedWords.Keys)
  {
    string ansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternLabel" + i_1;
    Type mcCl = (Type)Sharpen.Runtime.GetType(ansclstr);
    machineAnswerClasses[label_1] = mcCl;
    string humanansclstr = "edu.stanford.nlp.patterns.PatternsAnnotations$PatternHumanLabel" + i_1;
    humanLabelClasses[label_1] = (Type)Sharpen.Runtime.GetType(humanansclstr);
    i_1++;
  }
}
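// Illustrative input for SetUpProperties (hypothetical values): a line such as
//   {"seedWords":{"name":["Bush","Carter"]},"input":"/tmp/sents.txt"}
// fills seedWords for the "name" label and copies every other key, here "input",
// into the Properties object.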
/// <exception cref="System.MemberAccessException"/>
/// <exception cref="System.Exception"/>
/// <exception cref="Java.Util.Concurrent.ExecutionException"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Java.Lang.InstantiationException"/>
/// <exception cref="System.MissingMethodException"/>
/// <exception cref="System.Reflection.TargetInvocationException"/>
/// <exception cref="System.TypeLoadException"/>
/// <exception cref="Java.Sql.SQLException"/>
public virtual string SuggestPhrasesTest(Properties testProps, string modelPropertiesFile, string stopWordsFile)
{
  logger.Info("Suggesting phrases in test");
  logger.Info("test properties are " + testProps);
  Properties runProps = StringUtils.ArgsToPropertiesWithResolve(new string[] { "-props", modelPropertiesFile });
  string[] removeProperties = new string[] { "allPatternsDir", "storePatsForEachToken", "invertedIndexClass", "savePatternsWordsDir", "batchProcessSents", "outDir", "saveInvertedIndex", "removeOverLappingLabels", "numThreads" };
  foreach (string s in removeProperties)
  {
    if (runProps.Contains(s))
    {
      runProps.Remove(s);
    }
  }
  runProps.SetProperty("stopWordsPatternFiles", stopWordsFile);
  runProps.SetProperty("englishWordsFiles", stopWordsFile);
  runProps.SetProperty("commonWordsPatternFiles", stopWordsFile);
  runProps.PutAll(props);
  runProps.PutAll(testProps);
  props.PutAll(runProps);
  ProcessText(false);
  GetPatternsFromDataMultiClass<SurfacePattern> model = new GetPatternsFromDataMultiClass<SurfacePattern>(runProps, Data.sents, seedWords, true, humanLabelClasses);
  ArgumentParser.FillOptions(model, runProps);
  GetPatternsFromDataMultiClass.LoadFromSavedPatternsWordsDir(model, runProps);
  IDictionary<string, int> alreadyLearnedIters = new Dictionary<string, int>();
  foreach (string label in model.constVars.GetLabels())
  {
    alreadyLearnedIters[label] = model.constVars.GetLearnedWordsEachIter()[label].LastEntry().Key;
  }
  if (model.constVars.learn)
  {
    // Map<String, E> p0 = new HashMap<String, SurfacePattern>();
    // Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<String, Counter<CandidatePhrase>>();
    // Map<String, Set<E>> ignorePatterns = new HashMap<String, Set<E>>();
    model.IterateExtractApply(null, null, null);
  }
  IDictionary<string, ICounter<CandidatePhrase>> allExtractions = new Dictionary<string, ICounter<CandidatePhrase>>();
  // only for one label right now!
  IEnumerator<string> labelIter = model.constVars.GetLabels().GetEnumerator();
  labelIter.MoveNext();
  string label_1 = labelIter.Current;
  allExtractions[label_1] = new ClassicCounter<CandidatePhrase>();
  foreach (KeyValuePair<string, DataInstance> sent in Data.sents)
  {
    StringBuilder str = new StringBuilder();
    foreach (CoreLabel l in sent.Value.GetTokens())
    {
      if (l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null && !l.Get(typeof(PatternsAnnotations.MatchedPatterns)).IsEmpty())
      {
        str.Append(" " + l.Word());
      }
      else
      {
        if (str.Length > 0)
        {
          // flush the accumulated phrase; the length check avoids counting empty spans
          allExtractions[label_1].IncrementCount(CandidatePhrase.CreateOrGet(str.ToString().Trim()));
        }
        str.Length = 0;
      }
    }
  }
  allExtractions.PutAll(model.matchedSeedWords);
  return model.constVars.GetSetWordsAsJson(allExtractions);
}
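// Minimal usage sketch (not part of the original source; the property and file names
// are hypothetical), assuming `suggester` is an initialized instance of this class
// with `props` already set up:
// Properties testProps = new Properties();
// string json = suggester.SuggestPhrasesTest(testProps, "model.properties", "stopwords.txt");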