private void StatsWithoutApplyingPatterns(IDictionary<string, DataInstance> sents, PatternsForEachToken patternsForEachToken,
    ICounter<E> patternsLearnedThisIter, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted)
{
    foreach (KeyValuePair<string, DataInstance> sentEn in sents)
    {
        IDictionary<int, ICollection<E>> pat4Sent = patternsForEachToken.GetPatternsForAllTokens(sentEn.Key);
        if (pat4Sent == null)
        {
            throw new Exception("How come there are no patterns for " + sentEn.Key);
        }
        foreach (KeyValuePair<int, ICollection<E>> en in pat4Sent)
        {
            CoreLabel token = null;
            ICollection<E> p1 = en.Value;
            foreach (E index in patternsLearnedThisIter.KeySet())
            {
                if (p1.Contains(index))
                {
                    if (token == null)
                    {
                        token = sentEn.Value.GetTokens()[en.Key];
                    }
                    wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(token.Word(), token.Lemma()), index);
                }
            }
        }
    }
}
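// A minimal, self-contained sketch of the counting idea above, using plain .NET collections
// in place of the CoreNLP/Sharpen types. All names here are hypothetical: a "sentence" is a
// list of (word, lemma) tokens, and patternsPerToken maps each token position to the set of
// pattern ids pre-indexed as firing there, so no matchers need to be re-run.
using System.Collections.Generic;

static class StatsSketch
{
    // Counts (word, patternId) pairs for the patterns learned this iteration,
    // mirroring StatsWithoutApplyingPatterns for a single sentence.
    public static Dictionary<(string Word, int Pat), int> CountWithoutApplying(
        List<(string Word, string Lemma)> tokens,
        Dictionary<int, HashSet<int>> patternsPerToken,
        HashSet<int> patternsLearnedThisIter)
    {
        var counts = new Dictionary<(string Word, int Pat), int>();
        foreach (var en in patternsPerToken)
        {
            foreach (int pat in patternsLearnedThisIter)
            {
                if (en.Value.Contains(pat))
                {
                    var key = (tokens[en.Key].Word, pat);
                    counts[key] = counts.TryGetValue(key, out int c) ? c + 1 : 1;
                }
            }
        }
        return counts;
    }
}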
/// <exception cref="System.IO.IOException"/> public virtual void Process() { SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory(); Tree t; foreach (File file in fileList) { Reader @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), AncoraEncoding)); ITreeReader tr = trf.NewTreeReader(@in); // Tree reading will implicitly perform tree normalization for us while ((t = tr.ReadTree()) != null) { // Update tagger with this tree IList <CoreLabel> yield = t.TaggedLabeledYield(); foreach (CoreLabel leafLabel in yield) { if (leafLabel.Tag().Equals(SpanishTreeNormalizer.MwTag)) { continue; } unigramTagger.IncrementCount(leafLabel.Word(), leafLabel.Tag()); } } } }
private static void UpdateTagger(TwoDimensionalCounter<string, string> tagger, Tree t)
{
    IList<CoreLabel> yield = t.TaggedLabeledYield();
    foreach (CoreLabel label in yield)
    {
        if (label.Tag().Equals(SpanishTreeNormalizer.MwTag))
        {
            continue;
        }
        tagger.IncrementCount(label.Word(), label.Tag());
    }
}
public static void UpdateTagger(TwoDimensionalCounter<string, string> tagger, Tree t)
{
    IList<CoreLabel> yield = t.TaggedLabeledYield();
    foreach (CoreLabel cl in yield)
    {
        if (ResolveDummyTags && cl.Tag().Equals(FrenchXMLTreeReader.MissingPos))
        {
            continue;
        }
        else
        {
            tagger.IncrementCount(cl.Word(), cl.Tag());
        }
    }
}
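// Both UpdateTagger variants above reduce to the same counting step. As a minimal sketch of
// what TwoDimensionalCounter<string, string>.IncrementCount does here, this is a hypothetical
// stand-in (not the CoreNLP class) where counts[word][tag] is the observed frequency.
using System.Collections.Generic;

class UnigramTagCounter
{
    private readonly Dictionary<string, Dictionary<string, double>> counts =
        new Dictionary<string, Dictionary<string, double>>();

    public void IncrementCount(string word, string tag)
    {
        if (!counts.TryGetValue(word, out var tags))
        {
            counts[word] = tags = new Dictionary<string, double>();
        }
        tags[tag] = tags.TryGetValue(tag, out var c) ? c + 1.0 : 1.0;
    }

    public double GetCount(string word, string tag) =>
        counts.TryGetValue(word, out var tags) && tags.TryGetValue(tag, out var c) ? c : 0.0;
}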
public static void CountMWEStatistics(Tree t,
    TwoDimensionalCounter<string, string> unigramTagger,
    TwoDimensionalCounter<string, string> labelPreterm,
    TwoDimensionalCounter<string, string> pretermLabel,
    TwoDimensionalCounter<string, string> labelTerm,
    TwoDimensionalCounter<string, string> termLabel)
{
    UpdateTagger(unigramTagger, t);
    // Count MWE statistics
    TregexMatcher m = pMWE.Matcher(t);
    while (m.FindNextMatchingNode())
    {
        Tree match = m.GetMatch();
        string label = match.Value();
        if (ResolveDummyTags && label.Equals(FrenchXMLTreeReader.MissingPhrasal))
        {
            continue;
        }
        string preterm = SentenceUtils.ListToString(match.PreTerminalYield());
        string term = SentenceUtils.ListToString(match.Yield());
        labelPreterm.IncrementCount(label, preterm);
        pretermLabel.IncrementCount(preterm, label);
        labelTerm.IncrementCount(label, term);
        termLabel.IncrementCount(term, label);
    }
}
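// CountMWEStatistics stores each (label, yield) association in both directions so that later
// lookups can start from either side. A usage sketch with the stand-in counter above; the
// label "MWN" and the yields are hypothetical example values:
var labelPreterm = new UnigramTagCounter();
var pretermLabel = new UnigramTagCounter();
labelPreterm.IncrementCount("MWN", "N P N");   // label -> preterminal sequence
pretermLabel.IncrementCount("N P N", "MWN");   // preterminal sequence -> label
// labelPreterm answers "which POS sequences realize MWN?";
// pretermLabel answers "which labels has 'N P N' carried?".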
/// <exception cref="System.Exception"/> public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call() { // CollectionValuedMap<String, Integer> tokensMatchedPattern = new // CollectionValuedMap<String, Integer>(); try { ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>(); TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); foreach (string sentid in sentids) { IList <CoreLabel> sent = sents[sentid].GetTokens(); foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns) { if (pEn.Key == null) { throw new Exception("why is the pattern " + pEn + " null?"); } TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent)); // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced // m.setFindType(SequenceMatcher.FindType.FIND_ALL); //Higher branch values makes the faster but uses more memory m.SetBranchLimit(5); while (m.Find()) { int s = m.Start("$term"); int e = m.End("$term"); System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label); string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) { s = i + 1; break; } } for (int i_1 = e; i_1 < sent.Count; i_1++) { if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label)) { e = i_1; break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // not needed as initialized false for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = sent[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } SurfacePattern pSur = (SurfacePattern)pEn.Value; System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!"); System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. 
The classes in the key set are " + l.KeySet()); l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur); foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse) { matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1)); phrase = phrase.Trim(); if (!phrase.IsEmpty()) { phraseLemma = phraseLemma.Trim(); CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma); allFreq.IncrementCount(candPhrase, pEn.Value, 1.0); if (!useWordNotLabeled) { alreadyLabeledPhrases.Add(candPhrase); } } } } } } return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases)); } catch (Exception e) { logger.Error(e); throw; } }
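// The addedindices pass above rejects a candidate phrase when a single dropped (stop-word)
// token sits strictly between two kept tokens, i.e. the stop word was removed from the middle
// of the phrase rather than trimmed at its edges. A standalone sketch of that rule
// (hypothetical helper, extracted here only for clarity):
static bool HasInternalGap(bool[] added)
{
    for (int i = 1; i < added.Length - 1; i++)
    {
        if (added[i - 1] && !added[i] && added[i + 1])
        {
            return true;  // kept, dropped, kept in a row => discard the phrase
        }
    }
    return false;
}
// HasInternalGap(new[] { true, false, true })  -> true  (stop word inside the phrase)
// HasInternalGap(new[] { false, true, true })  -> false (stop word trimmed at the edge)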
/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq) { ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>(); if (constVars.doNotApplyPatterns) { // if want to get the stats by the lossy way of just counting without // applying the patterns ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while (sentsIter.MoveNext()) { Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current; this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted); } } else { if (patternsLearnedThisIter.Size() > 0) { this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords); } } if (computeProcDataFreq) { if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None)) { Redwood.Log(Redwood.Dbg, "computing processed freq"); foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet()) { double @in = fq.Value; if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt)) { @in = Math.Sqrt(@in); } else { if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log)) { @in = 1 + Math.Log(@in); } else { throw new Exception("can't understand the normalization"); } } System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in); Data.processedDataFreq.SetCount(fq.Key, @in); } } else { Data.processedDataFreq = Data.rawFreq; } } if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm)) { foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet()) { if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en)) { terms.AddAll(en, wordsPatExtracted.GetCounter(en)); } } RemoveKeys(terms, ConstantsAndVariables.GetStopWords()); ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false); System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S."))); ICollection <CandidatePhrase> ignoreWordsAll; if (ignoreWords != null && !ignoreWords.IsEmpty()) { ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords()); } else { ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords()); } Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]); Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet()); System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. 
is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S."))); ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract); phraseScorer.PrintReasonForChoosing(finalwords); scoreForAllWordsThisIteration.Clear(); Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t")); if (constVars.goldEntities != null) { IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label]; if (goldEntities4Label != null) { StringBuilder s = new StringBuilder(); finalwords.KeySet().Stream().ForEach(null); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString()); } else { Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label); } } if (constVars.outDir != null && !constVars.outDir.IsEmpty()) { string outputdir = constVars.outDir + "/" + identifier + "/" + label; IOUtils.EnsureDir(new File(outputdir)); TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>(); foreach (CandidatePhrase word in finalwords.KeySet()) { foreach (E l in wordsPatExtracted.GetCounter(word).KeySet()) { foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l)) { reasonForWords.IncrementCount(word, w2); } } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); string filename = outputdir + "/words.json"; // the json object is an array corresponding to each iteration - of list // of objects, // each of which is a bean of entity and reasons IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder(); if (writtenInJustification.Contains(label) && writtenInJustification[label]) { IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename))); IJsonArray objarr = jsonReader.ReadArray(); foreach (IJsonValue o in objarr) { obj.Add(o); } jsonReader.Close(); } IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w in reasonForWords.FirstKeySet()) { IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder(); IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet()) { l.Add(w2.GetPhrase()); } IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder(); foreach (E p in wordsPatExtracted.GetCounter(w)) { pats.Add(p.ToStringSimple()); } objinner.Add("reasonwords", l); objinner.Add("patterns", pats); objinner.Add("score", finalwords.GetCount(w)); objinner.Add("entity", w.GetPhrase()); objThisIter.Add(objinner.Build()); } obj.Add(objThisIter); // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, // "Writing justification at " + filename); IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII"); writtenInJustification[label] = true; } if (constVars.justify) { Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n"); foreach (CandidatePhrase word in finalwords.KeySet()) { Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n")); } } // if (usePatternResultAsLabel) // if (answerLabel != 
null) // labelWords(sents, commonEngWords, finalwords.keySet(), // patterns.keySet(), outFile); // else // throw new RuntimeException("why is the answer label null?"); return(finalwords); } else { if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb)) { Counters.AddInPlace(terms, wordsPatExtracted); ICounter <CandidatePhrase> maxPatWeightTerms = new ClassicCounter <CandidatePhrase>(); IDictionary <CandidatePhrase, E> wordMaxPat = new Dictionary <CandidatePhrase, E>(); foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet()) { ICounter <E> weights = new ClassicCounter <E>(); foreach (E k in en.Value.KeySet()) { weights.SetCount(k, patternsLearnedThisIter.GetCount(k)); } maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights)); wordMaxPat[en.Key] = Counters.Argmax(weights); } Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords); double maxvalue = Counters.Max(maxPatWeightTerms); ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10); CandidatePhrase bestw = null; if (words.Count > 1) { double max = double.NegativeInfinity; foreach (CandidatePhrase w in words) { if (terms.GetCount(w, wordMaxPat[w]) > max) { max = terms.GetCount(w, wordMaxPat[w]); bestw = w; } } } else { if (words.Count == 1) { bestw = words.GetEnumerator().Current; } else { return(new ClassicCounter <CandidatePhrase>()); } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw); return(Counters.AsCounter(Arrays.AsList(bestw))); } else { throw new Exception("wordscoring " + constVars.wordScoring + " not identified"); } } }
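// The Bpb branch above first ranks words by the weight of their single best pattern, then
// breaks ties (within 1e-10) by the word's co-occurrence count with that argmax pattern.
// A sketch of the selection on plain dictionaries; all names here are hypothetical:
using System;
using System.Collections.Generic;
using System.Linq;

static class BpbSketch
{
    // wordPatCounts[word][pat] = co-occurrence count; patWeights[pat] = learned weight.
    public static string SelectBestWord(
        Dictionary<string, Dictionary<string, double>> wordPatCounts,
        Dictionary<string, double> patWeights)
    {
        string bestWord = null;
        double bestWeight = double.NegativeInfinity;
        double bestTie = double.NegativeInfinity;
        foreach (var en in wordPatCounts)
        {
            // argmax pattern by learned weight for this word
            string maxPat = en.Value.Keys.OrderByDescending(p => patWeights.GetValueOrDefault(p)).First();
            double w = patWeights.GetValueOrDefault(maxPat);
            double tie = en.Value[maxPat];
            if (w > bestWeight + 1e-10 || (Math.Abs(w - bestWeight) <= 1e-10 && tie > bestTie))
            {
                bestWord = en.Key;
                bestWeight = w;
                bestTie = tie;
            }
        }
        return bestWord;
    }
}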
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
        System.Environment.Exit(-1);
    }
    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> mweLabelToString = new TwoDimensionalCounter<string, string>();
    ICollection<string> uniquePOSSequences = Generics.NewHashSet();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        TregexPattern pMWE = TregexPattern.Compile("/^MW/");
        for (Tree t; (t = tr.ReadTree()) != null; )
        {
            // Count MWE statistics
            TregexMatcher m = pMWE.Matcher(t);
            while (m.FindNextMatchingNode())
            {
                Tree match = m.GetMatch();
                string label = match.Value();
                IList<CoreLabel> yield = match.TaggedLabeledYield();
                StringBuilder termYield = new StringBuilder();
                StringBuilder posYield = new StringBuilder();
                foreach (CoreLabel cl in yield)
                {
                    termYield.Append(cl.Word()).Append(" ");
                    posYield.Append(cl.Tag()).Append(" ");
                }
                mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                uniquePOSSequences.Add(posYield.ToString().Trim());
            }
        }
        // Closes the underlying reader
        tr.Close();
        System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.TotalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        foreach (string mweLabel in mweLabelToString.FirstKeySet())
        {
            int nSingletons = 0;
            double totalCount = mweLabelToString.TotalCount(mweLabel);
            ICounter<string> mc = mweLabelToString.GetCounter(mweLabel);
            foreach (string term in mc.KeySet())
            {
                if (mc.GetCount(term) == 1.0)
                {
                    nSingletons++;
                }
                nTokens += term.Split("\\s+").Length * (int)mc.GetCount(term);
            }
            nAllSingletons += nSingletons;
            System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons,
                100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.Console.Out.WriteLine("#tokens = " + nTokens);
        System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
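// The per-label table row printed above reduces to two ratios. A worked toy example
// (all values hypothetical): a label seen 10 times, of which 4 distinct strings occur
// exactly once, against 50 MWE occurrences overall.
using System;

static class TableRowSketch
{
    public static void Demo()
    {
        double totalCount = 10;  // occurrences of this MWE label
        int nSingletons = 4;     // distinct strings seen exactly once under this label
        double nMWEs = 50;       // MWE occurrences across all labels
        Console.WriteLine((100.0 * nSingletons / totalCount).ToString("F2"));  // %Single = 40.00
        Console.WriteLine((100.0 * totalCount / nMWEs).ToString("F2"));        // %Total  = 20.00
    }
}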
/// <exception cref="System.Exception"/> public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call() { // CollectionValuedMap<String, Integer> tokensMatchedPattern = new // CollectionValuedMap<String, Integer>(); TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); foreach (string sentid in sentids) { DataInstance sent = sents[sentid]; IList <CoreLabel> tokens = sent.GetTokens(); foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns) { if (pEn.Key == null) { throw new Exception("why is the pattern " + pEn + " null?"); } SemanticGraph graph = ((DataInstanceDep)sent).GetGraph(); //SemgrexMatcher m = pEn.getKey().matcher(graph); //TokenSequenceMatcher m = pEn.getKey().matcher(sent); // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced // m.setFindType(SequenceMatcher.FindType.FIND_ALL); //Higher branch values makes the faster but uses more memory //m.setBranchLimit(5); ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label); foreach (ExtractedPhrase match in matched) { int s = match.startIndex; int e = match.endIndex + 1; string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label]) { s = i; } else { //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s); break; } } for (int i_1 = e; i_1 < tokens.Count; i_1++) { if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label]) { e = i_1; } else { //System.out.println("for phrase " + match + " clubbing next word. new e is " + e); break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // get for free on array initialization for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = tokens[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } Pattern pSur = pEn.Value; System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!"); System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. 
The classes in the key set are " + l.KeySet()); l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur); foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse && useWordNotLabeled) { matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1)); if (useWordNotLabeled) { phrase = phrase.Trim(); phraseLemma = phraseLemma.Trim(); allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0); } } } } } return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat)); }
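// The clubbing pass above widens the matched span over neighboring tokens that already carry
// the target label, stopping at the per-label phrase-length cap. A standalone sketch with
// token labels as plain strings (hypothetical helper; s inclusive, e exclusive):
static (int Start, int End) ClubNeighbors(string[] tokenLabels, int s, int e, string label, int maxLen)
{
    // absorb labeled tokens to the left while the widened span stays within maxLen
    for (int i = s - 1; i >= 0 && tokenLabels[i] == label && (e - i) <= maxLen; i--)
    {
        s = i;
    }
    // absorb labeled tokens to the right under the same cap
    for (int i = e; i < tokenLabels.Length && tokenLabels[i] == label && (i - s + 1) <= maxLen; i++)
    {
        e = i + 1;
    }
    return (s, e);
}
// ClubNeighbors(new[] { "LOC", "O", "LOC", "LOC" }, 2, 3, "LOC", 4) -> (2, 4)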
public virtual double[] GetInformationGains()
{
    // Don't need to trim data to size; trimming is dangerous if the dataset is empty
    // (you can't add to it thereafter)
    labels = TrimToSize(labels);
    // Counts the number of times word X is present
    ClassicCounter<F> featureCounter = new ClassicCounter<F>();
    // Counts the number of times a document has label Y
    ClassicCounter<L> labelCounter = new ClassicCounter<L>();
    // Counts the number of times a document has label Y given that word X is present
    TwoDimensionalCounter<F, L> condCounter = new TwoDimensionalCounter<F, L>();
    for (int i = 0; i < labels.Length; i++)
    {
        labelCounter.IncrementCount(labelIndex.Get(labels[i]));
        // Convert the document to a binary feature representation
        bool[] doc = new bool[featureIndex.Size()];
        for (int j = 0; j < data[i].Length; j++)
        {
            doc[data[i][j]] = true;
        }
        for (int j_1 = 0; j_1 < doc.Length; j_1++)
        {
            if (doc[j_1])
            {
                featureCounter.IncrementCount(featureIndex.Get(j_1));
                condCounter.IncrementCount(featureIndex.Get(j_1), labelIndex.Get(labels[i]), 1.0);
            }
        }
    }
    // Entropy of the label distribution, H(Y), in bits
    double entropy = 0.0;
    for (int i_1 = 0; i_1 < labelIndex.Size(); i_1++)
    {
        double labelCount = labelCounter.GetCount(labelIndex.Get(i_1));
        double p = labelCount / Size();
        entropy -= p * (Math.Log(p) / Math.Log(2));
    }
    double[] ig = new double[featureIndex.Size()];
    Arrays.Fill(ig, entropy);
    for (int i_2 = 0; i_2 < featureIndex.Size(); i_2++)
    {
        F feature = featureIndex.Get(i_2);
        double featureCount = featureCounter.GetCount(feature);
        double notFeatureCount = Size() - featureCount;
        double pFeature = featureCount / Size();
        double pNotFeature = 1.0 - pFeature;
        if (featureCount == 0 || notFeatureCount == 0)
        {
            ig[i_2] = 0;
            continue;
        }
        double sumFeature = 0.0;
        double sumNotFeature = 0.0;
        for (int j = 0; j < labelIndex.Size(); j++)
        {
            L label = labelIndex.Get(j);
            double featureLabelCount = condCounter.GetCount(feature, label);
            double notFeatureLabelCount = Size() - featureLabelCount;
            // Yes, these don't sum to 1; that is correct. One is the probability of the label
            // given that the feature is present, and the other is the probability of the label
            // given that the feature is absent.
            double p = featureLabelCount / featureCount;
            double pNot = notFeatureLabelCount / notFeatureCount;
            if (featureLabelCount != 0)
            {
                sumFeature += p * (Math.Log(p) / Math.Log(2));
            }
            if (notFeatureLabelCount != 0)
            {
                sumNotFeature += pNot * (Math.Log(pNot) / Math.Log(2));
            }
        }
        ig[i_2] += pFeature * sumFeature + pNotFeature * sumNotFeature;
        /* This line used to be: ig[i] = pFeature*sumFeature + pNotFeature*sumNotFeature;
         * which completely ignored the entropy term computed above, so the "+=" was added
         * to take it into account. -Ramesh ([email protected]) */
    }
    return ig;
}
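// As a concrete check of the quantity computed above, the textbook information gain is
//   IG(f) = H(Y) - p(f) * H(Y | f present) - p(not f) * H(Y | f absent).
// A minimal sketch for the binary-label case (hypothetical helper, not the dataset class):
// with labels {+,+,-,-} and a feature present in exactly the two + documents,
// H(Y) = 1 bit and both conditional entropies are 0, so IG = 1.
using System;

static class InfoGainSketch
{
    public static double InformationGain(bool[] feature, bool[] label)
    {
        int n = label.Length, nF = 0, nPos = 0, nFPos = 0;
        for (int i = 0; i < n; i++)
        {
            if (label[i]) nPos++;
            if (feature[i]) { nF++; if (label[i]) nFPos++; }
        }
        // entropy (in bits) of a two-class split: k positives out of m documents
        double H(int k, int m)
        {
            if (m == 0) return 0.0;
            double h = 0.0, p1 = (double)k / m, p2 = 1.0 - p1;
            if (p1 > 0) h -= p1 * Math.Log(p1, 2);
            if (p2 > 0) h -= p2 * Math.Log(p2, 2);
            return h;
        }
        double pF = (double)nF / n;
        return H(nPos, n) - pF * H(nFPos, nF) - (1 - pF) * H(nPos - nFPos, n - nF);
    }
}
// InformationGain(new[] { true, true, false, false },
//                 new[] { true, true, false, false }) == 1.0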
/// <exception cref="System.Exception"/> public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call() { //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); TwoDimensionalCounter <Pair <string, string>, E> allFreq = new TwoDimensionalCounter <Pair <string, string>, E>(); foreach (string sentid in sentids) { IList <CoreLabel> sent = sents[sentid].GetTokens(); //FIND_ALL is faster than FIND_NONOVERLAP IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll); foreach (ISequenceMatchResult <ICoreMap> m in matched) { int s = m.Start("$term"); int e = m.End("$term"); E matchedPat = patterns[m.Pattern()]; matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e)); string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) { s = i + 1; break; } } for (int i_1 = e; i_1 < sent.Count; i_1++) { if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label)) { e = i_1; break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // unneeded as done on initialization for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = sent[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns))) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat); // if (restrictToMatched) { // tokensMatchedPattern.add(sentid, i); // } foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString())) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse && useWordNotLabeled) { phrase = phrase.Trim(); phraseLemma = phraseLemma.Trim(); allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0); } } } // for (SurfacePattern pat : patterns.keySet()) { // String patternStr = pat.toString(); // // TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr); // if (pat == null || p == null) // throw new 
RuntimeException("why is the pattern " + pat + " null?"); // // TokenSequenceMatcher m = p.getMatcher(sent); // while (m.find()) { // // int s = m.start("$term"); // int e = m.end("$term"); // // String phrase = ""; // String phraseLemma = ""; // boolean useWordNotLabeled = false; // boolean doNotUse = false; // for (int i = s; i < e; i++) { // CoreLabel l = sent.get(i); // l.set(PatternsAnnotations.MatchedPattern.class, true); // if (restrictToMatched) { // tokensMatchedPattern.add(sentid, i); // } // for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) { // if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) { // doNotUse = true; // } // } // boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords); // if (removePhrasesWithStopWords && containsStop) { // doNotUse = true; // } else { // if (!containsStop || !removeStopWordsFromSelectedPhrases) { // // if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) { // useWordNotLabeled = true; // } // phrase += " " + l.word(); // phraseLemma += " " + l.lemma(); // // } // } // } // if (!doNotUse && useWordNotLabeled) { // phrase = phrase.trim(); // phraseLemma = phraseLemma.trim(); // allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0); // } // } // } return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat)); }