/// <summary>
/// Returns the sentence ids, from the file-backed index, of sentences that contain all the given
/// relevant words, i.e. the intersection of the posting lists of the non-stopword key-value pairs.
/// </summary>
public virtual ICollection<string> GetFileSentIds(CollectionValuedMap<string, string> relevantWords) {
  ICollection<string> sentids = null;
  foreach (KeyValuePair<string, ICollection<string>> en in relevantWords) {
    foreach (string en2 in en.Value) {
      if (!stopWords.Contains(en2.ToLower())) {
        string w = CombineKeyValue(en.Key, en2);
        ICollection<string> st = index[w];
        if (st == null) {
          // log.info("\n\nWARNING: INDEX HAS NO SENTENCES FOR " + w);
          return Java.Util.Collections.EmptySet();
        }
        if (sentids == null) {
          sentids = st;
        } else {
          sentids = CollectionUtils.Intersection(sentids, st);
        }
      }
    }
  }
  return sentids;
}
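// A minimal usage sketch for GetFileSentIds above (hypothetical keys/values; assumes the index was
// built with the same CombineKeyValue encoding):
//   CollectionValuedMap<string, string> relevantWords = new CollectionValuedMap<string, string>();
//   relevantWords.Add("word", "Obama");
//   relevantWords.Add("lemma", "meet");
//   ICollection<string> ids = GetFileSentIds(relevantWords);  // ids of sentences containing all non-stopword terms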
public virtual IList<IList<Mention>> ExtractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
  IList<ICoreMap> sentences = conllDoc.GetAnnotation().Get(typeof(CoreAnnotations.SentencesAnnotation));
  IList<IList<Mention>> allGoldMentions = new List<IList<Mention>>();
  CollectionValuedMap<string, ICoreMap> corefChainMap = conllDoc.GetCorefChainMap();
  for (int i = 0; i < sentences.Count; i++) {
    allGoldMentions.Add(new List<Mention>());
  }
  int maxCorefClusterId = -1;
  foreach (string corefIdStr in corefChainMap.Keys) {
    int id = System.Convert.ToInt32(corefIdStr);
    if (id > maxCorefClusterId) {
      maxCorefClusterId = id;
    }
  }
  int newMentionID = maxCorefClusterId + 1;
  foreach (KeyValuePair<string, ICollection<ICoreMap>> idChainEntry in corefChainMap) {
    int id = System.Convert.ToInt32(idChainEntry.Key);
    int clusterMentionCnt = 0;
    foreach (ICoreMap m in idChainEntry.Value) {
      clusterMentionCnt++;
      Mention mention = new Mention();
      mention.goldCorefClusterID = id;
      if (clusterMentionCnt == 1) {
        // First mention in cluster
        mention.mentionID = id;
        mention.originalRef = -1;
      } else {
        mention.mentionID = newMentionID;
        mention.originalRef = id;
        newMentionID++;
      }
      if (maxID < mention.mentionID) {
        maxID = mention.mentionID;
      }
      int sentIndex = m.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
      ICoreMap sent = sentences[sentIndex];
      mention.startIndex = m.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) - sent.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
      mention.endIndex = m.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - sent.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
      // will be set by arrange
      mention.originalSpan = m.Get(typeof(CoreAnnotations.TokensAnnotation));
      // Mention dependency graph is the enhanced dependency graph of the sentence
      mention.dependency = sentences[sentIndex].Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
      allGoldMentions[sentIndex].Add(mention);
    }
  }
  return allGoldMentions;
}
protected internal virtual IDictionary<E, IDictionary<string, DataInstance>> GetSentences(IDictionary<E, ICollection<string>> sentids) {
  try {
    ICollection<File> files = new HashSet<File>();
    IDictionary<E, IDictionary<string, DataInstance>> sentsAll = new Dictionary<E, IDictionary<string, DataInstance>>();
    CollectionValuedMap<string, E> sentIds2Pats = new CollectionValuedMap<string, E>();
    foreach (KeyValuePair<E, ICollection<string>> setEn in sentids) {
      if (!sentsAll.ContainsKey(setEn.Key)) {
        sentsAll[setEn.Key] = new Dictionary<string, DataInstance>();
      }
      foreach (string s in setEn.Value) {
        sentIds2Pats.Add(s, setEn.Key);
        if (constVars.batchProcessSents) {
          File f = Data.sentId2File[s];
          System.Diagnostics.Debug.Assert(f != null, "How come no file for sentence " + s);
          files.Add(f);
        }
      }
    }
    if (constVars.batchProcessSents) {
      // Batch mode: sentences live in serialized files on disk; load each file once and route its
      // sentences to every pattern that asked for them
      foreach (File f in files) {
        IDictionary<string, DataInstance> sentsf = IOUtils.ReadObjectFromFile(f);
        foreach (KeyValuePair<string, DataInstance> s in sentsf) {
          foreach (E pat in sentIds2Pats[s.Key]) {
            sentsAll[pat][s.Key] = s.Value;
          }
        }
      }
    } else {
      foreach (KeyValuePair<string, DataInstance> s in Data.sents) {
        foreach (E pat in sentIds2Pats[s.Key]) {
          sentsAll[pat][s.Key] = s.Value;
        }
      }
    }
    return sentsAll;
  } catch (TypeLoadException e) {
    throw new Exception(e.Message, e);
  } catch (IOException e1) {
    throw new Exception(e1.Message, e1);
  }
}
/// <summary>give all sentences that have these words</summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Lucene.Queryparser.Classic.ParseException"/>
internal virtual ICollection<string> QueryIndexGetSentences(CollectionValuedMap<string, string> words) {
  SetIndexReaderSearcher();
  // Build a boolean AND query over all (field, word) pairs, skipping stop words in the processed-text field
  BooleanQuery query = new BooleanQuery();
  string pkey = Token.GetKeyForClass(typeof(PatternsAnnotations.ProcessedTextAnnotation));
  foreach (KeyValuePair<string, ICollection<string>> en in words) {
    bool processedKey = en.Key.Equals(pkey);
    foreach (string en2 in en.Value) {
      if (!processedKey || !stopWords.Contains(en2.ToLower())) {
        query.Add(new BooleanClause(new TermQuery(new Term(en.Key, en2)), BooleanClause.Occur.Must));
      }
    }
  }
  TopDocs tp = searcher.Search(query, int.MaxValue);
  ICollection<string> sentids = new HashSet<string>();
  if (tp.totalHits > 0) {
    foreach (ScoreDoc s in tp.scoreDocs) {
      int docId = s.doc;
      Org.Apache.Lucene.Document.Document d = searcher.Doc(docId);
      sentids.Add(d.Get("sentid"));
    }
  } else {
    throw new Exception("how come no documents for " + words + ". Query formed is " + query);
  }
  return sentids;
}
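// A usage sketch for QueryIndexGetSentences above (assumes the Lucene index stores one document per
// sentence with a "sentid" field, as the reads in the method imply):
//   CollectionValuedMap<string, string> words = new CollectionValuedMap<string, string>();
//   words.Add(Token.GetKeyForClass(typeof(PatternsAnnotations.ProcessedTextAnnotation)), "obama");
//   ICollection<string> sentIds = QueryIndexGetSentences(words);  // AND query over all (field, term) pairs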
public override CollectionValuedMap<string, string> GetRelevantWords() {
  CollectionValuedMap<string, string> relwordsThisPat = new CollectionValuedMap<string, string>();
  foreach (Pair<Token, GrammaticalRelation> r in relations) {
    GetRelevantWordsBase(r.First(), relwordsThisPat);
  }
  return relwordsThisPat;
}
public override CollectionValuedMap<string, string> GetRelevantWords() {
  CollectionValuedMap<string, string> relwordsThisPat = new CollectionValuedMap<string, string>();
  Token[] next = GetNextContext();
  GetRelevantWordsBase(next, relwordsThisPat);
  Token[] prev = GetPrevContext();
  GetRelevantWordsBase(prev, relwordsThisPat);
  return relwordsThisPat;
}
protected internal static void GetRelevantWordsBase(Token t, CollectionValuedMap<string, string> relWords) {
  if (t != null) {
    IDictionary<string, string> str = t.ClassORRestrictionsAsString();
    if (str != null) {
      relWords.AddAll(str);
    }
  }
}
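// Note on the two GetRelevantWords overrides above: the dependency-pattern variant gathers restriction
// words from the pattern's grammatical relations, while the surface-pattern variant gathers them from
// the previous/next token context; both flatten a token's class/word restrictions via GetRelevantWordsBase.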
private RVFDatum<string, string> GetDatum(CoreLabel[] sent, int i) {
  ICounter<string> feat = new ClassicCounter<string>();
  CoreLabel l = sent[i];
  string label;
  if (l.Get(answerClass).ToString().Equals(answerLabel)) {
    label = answerLabel;
  } else {
    label = "O";
  }
  CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
  if (matchedPhrases == null) {
    matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
    matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
  }
  foreach (CandidatePhrase w in matchedPhrases.AllValues()) {
    // Back off to cluster -1 when the phrase has no distributional-similarity cluster
    int num;
    if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num)) {
      num = -1;
    }
    feat.SetCount("Cluster-" + num, 1.0);
  }
  // feat.incrementCount("WORD-" + l.word());
  // feat.incrementCount("LEMMA-" + l.lemma());
  // feat.incrementCount("TAG-" + l.tag());
  int window = 0;  // a window of 0 disables the context features below
  for (int j = Math.Max(0, i - window); j < i; j++) {
    CoreLabel lj = sent[j];
    feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
    feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
    feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
  }
  for (int j1 = i + 1; j1 < sent.Length && j1 <= i + window; j1++) {
    CoreLabel lj = sent[j1];
    feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
    feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
    feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
  }
  // System.out.println("adding " + l.word() + " as " + label);
  return new RVFDatum<string, string>(feat, label);
}
/// <summary>Mark twin mentions: All mention boundaries should be matched</summary>
private void FindTwinMentionsStrict() {
  for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.Count; sentNum++) {
    IList<Mention> golds = goldOrderedMentionsBySentence[sentNum];
    IList<Mention> predicts = predictedOrderedMentionsBySentence[sentNum];
    // For CoNLL training there are some documents with gold mentions with the same position offsets
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<IntPair, Mention>();
    foreach (Mention g in golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.Contains(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        foreach (Mention eg in goldMentionPositions[ip]) {
          if (existingMentions.Length > 0) {
            existingMentions.Append(",");
          }
          existingMentions.Append(eg.mentionID);
        }
        SieveCoreferenceSystem.logger.Warning("WARNING: gold mentions with the same offsets: " + ip + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.SpanToString());
      }
      goldMentionPositions.Add(new IntPair(g.startIndex, g.endIndex), g);
    }
    foreach (Mention p in predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (goldMentionPositions.Contains(pos)) {
        // Pair the prediction with one (arbitrary) remaining gold mention at the same offsets
        ICollection<Mention> cm = goldMentionPositions[pos];
        IEnumerator<Mention> cmIt = cm.GetEnumerator();
        cmIt.MoveNext();
        Mention g1 = cmIt.Current;
        cm.Remove(g1);
        p.mentionID = g1.mentionID;
        p.twinless = false;
        g1.twinless = false;
      }
    }
    // temp: for making it easy to recognize twinless mentions
    foreach (Mention p1 in predicts) {
      if (p1.twinless) {
        p1.mentionID += 10000;
      }
    }
  }
}
public virtual CollectionValuedMap<string, JollyDayHolidays.JollyHoliday> GetAllHolidaysCVMap(ICollection<Holiday> allHolidays) {
  CollectionValuedMap<string, JollyDayHolidays.JollyHoliday> map = new CollectionValuedMap<string, JollyDayHolidays.JollyHoliday>();
  foreach (Holiday h in allHolidays) {
    string descKey = h.GetDescriptionPropertiesKey();
    if (descKey != null) {
      // Strip everything up to and including the last '.' so only the final key component remains
      descKey = descKey.ReplaceAll(".*\\.", string.Empty);
      JollyDayHolidays.JollyHoliday jh = new JollyDayHolidays.JollyHoliday(descKey, holidayManager, h);
      map.Add(jh.label, jh);
    }
  }
  return map;
}
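// Sketch of the result (hypothetical key): a holiday whose description properties key ends in
// "CHRISTMAS" would be stored as map["CHRISTMAS"] -> the JollyHoliday wrapping that JollyDay Holiday.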
private void ReadSRLFile(string srlFile) {
  srlMap = Generics.NewHashMap();
  foreach (string line in ObjectBank.GetLineIterator(new File(srlFile))) {
    // Each line is: <filename> <treeNum> <srl info for that tree>
    string[] bits = line.Split("\\s+", 3);
    string filename = bits[0];
    int treeNum = System.Convert.ToInt32(bits[1]);
    string info = bits[2];
    CollectionValuedMap<int, string> cvm;
    if (!srlMap.TryGetValue(filename, out cvm)) {
      cvm = new CollectionValuedMap<int, string>();
      srlMap[filename] = cvm;
    }
    cvm.Add(treeNum, info);
  }
}
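// Hypothetical input line for ReadSRLFile, in the PropBank-style shape the parsing above (and in
// ProcessFile below) implies: filename, tree number, then the SRL record kept verbatim in `info`:
//   wsj_0990.mrg 12 8 gold join.01 vf--a 0:2-ARG0 8:0-rel 9:1-ARG1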
/// <summary>
/// Given a CollectionValued Map of vectors, treats outer key as label for each
/// set of inner vectors.
/// </summary>
/// <remarks>
/// Given a CollectionValued Map of vectors, treats outer key as label for each
/// set of inner vectors.
/// NOTE: if l2NormalizeVectors is T, creates a copy of each vector and applies
/// l2Normalize to it.
/// </remarks>
public virtual KNNClassifier<K, V> Train(CollectionValuedMap<K, ICounter<V>> vecBag) {
  KNNClassifier<K, V> classifier = new KNNClassifier<K, V>(k, weightedVotes, l2NormalizeVectors);
  ICollection<RVFDatum<K, V>> instances = new List<RVFDatum<K, V>>();
  foreach (K label in vecBag.Keys) {
    RVFDatum<K, V> datum;
    foreach (ICounter<V> vector in vecBag[label]) {
      if (l2NormalizeVectors) {
        datum = new RVFDatum<K, V>(Counters.L2Normalize(new ClassicCounter<V>(vector)), label);
      } else {
        datum = new RVFDatum<K, V>(vector, label);
      }
      instances.Add(datum);
    }
  }
  classifier.AddInstances(instances);
  return classifier;
}
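// A minimal training sketch for Train above (hypothetical labels and features; K and V as strings):
//   CollectionValuedMap<string, ICounter<string>> bag = new CollectionValuedMap<string, ICounter<string>>();
//   ICounter<string> vec = new ClassicCounter<string>();
//   vec.SetCount("goal", 2.0);
//   vec.SetCount("match", 1.0);
//   bag.Add("sports", vec);  // every vector under "sports" becomes an RVFDatum with that label
//   KNNClassifier<string, string> knn = Train(bag);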
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private ICounter<CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, ICounter<E> allSelectedPatterns, ICollection<CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICounter<CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection<CandidatePhrase> ignoreWords, bool computeProcDataFreq) {
  ICollection<CandidatePhrase> alreadyLabeledWords = new HashSet<CandidatePhrase>();
  if (constVars.doNotApplyPatterns) {
    // if we want to get the stats the lossy way, by just counting instead of applying the patterns
    ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
    while (sentsIter.MoveNext()) {
      Pair<IDictionary<string, DataInstance>, File> sentsf = sentsIter.Current;
      this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
    }
  } else if (patternsLearnedThisIter.Size() > 0) {
    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
  }
  if (computeProcDataFreq) {
    if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None)) {
      Redwood.Log(Redwood.Dbg, "computing processed freq");
      foreach (KeyValuePair<CandidatePhrase, double> fq in Data.rawFreq.EntrySet()) {
        double @in = fq.Value;
        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt)) {
          @in = Math.Sqrt(@in);
        } else if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log)) {
          @in = 1 + Math.Log(@in);
        } else {
          throw new Exception("can't understand the normalization");
        }
        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
        Data.processedDataFreq.SetCount(fq.Key, @in);
      }
    } else {
      Data.processedDataFreq = Data.rawFreq;
    }
  }
  if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm)) {
    foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet()) {
      if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en)) {
        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
      }
    }
    RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
    ICounter<CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
    System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
    ICollection<CandidatePhrase> ignoreWordsAll;
    if (ignoreWords != null && !ignoreWords.IsEmpty()) {
      ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
    } else {
      ignoreWordsAll = new HashSet<CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
    }
    Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
    Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
    System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
    ICounter<CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
    phraseScorer.PrintReasonForChoosing(finalwords);
    scoreForAllWordsThisIteration.Clear();
    Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
    Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
    if (constVars.goldEntities != null) {
      IDictionary<string, bool> goldEntities4Label = constVars.goldEntities[label];
      if (goldEntities4Label != null) {
        StringBuilder s = new StringBuilder();
        // NOTE: the automatic Java-to-C# conversion dropped the lambda body here (it had become
        // "ForEach(null)"); reconstructed on the assumption that it appended each selected word with
        // its gold-label flag, or UNKNOWN when the word is not in the gold dictionary
        foreach (CandidatePhrase w in finalwords.KeySet()) {
          bool gold;
          s.Append(" " + w.GetPhrase() + (goldEntities4Label.TryGetValue(w.GetPhrase(), out gold) ? ":" + gold : ":UNKNOWN"));
        }
        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
      } else {
        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
      }
    }
    if (constVars.outDir != null && !constVars.outDir.IsEmpty()) {
      string outputdir = constVars.outDir + "/" + identifier + "/" + label;
      IOUtils.EnsureDir(new File(outputdir));
      TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<CandidatePhrase, CandidatePhrase>();
      foreach (CandidatePhrase word in finalwords.KeySet()) {
        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet()) {
          foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l)) {
            reasonForWords.IncrementCount(word, w2);
          }
        }
      }
      Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
      string filename = outputdir + "/words.json";
      // the json object is an array (one entry per iteration) of lists of objects,
      // each of which is a bean of entity and reasons
      IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
      if (writtenInJustification.Contains(label) && writtenInJustification[label]) {
        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
        IJsonArray objarr = jsonReader.ReadArray();
        foreach (IJsonValue o in objarr) {
          obj.Add(o);
        }
        jsonReader.Close();
      }
      IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
      foreach (CandidatePhrase w in reasonForWords.FirstKeySet()) {
        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
        IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder();
        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet()) {
          l.Add(w2.GetPhrase());
        }
        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
        foreach (E p in wordsPatExtracted.GetCounter(w)) {
          pats.Add(p.ToStringSimple());
        }
        objinner.Add("reasonwords", l);
        objinner.Add("patterns", pats);
        objinner.Add("score", finalwords.GetCount(w));
        objinner.Add("entity", w.GetPhrase());
        objThisIter.Add(objinner.Build());
      }
      obj.Add(objThisIter);
      IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
      writtenInJustification[label] = true;
    }
    if (constVars.justify) {
      Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
      foreach (CandidatePhrase word in finalwords.KeySet()) {
        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
      }
    }
    return finalwords;
  } else if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb)) {
    Counters.AddInPlace(terms, wordsPatExtracted);
    ICounter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<CandidatePhrase>();
    IDictionary<CandidatePhrase, E> wordMaxPat = new Dictionary<CandidatePhrase, E>();
    foreach (KeyValuePair<CandidatePhrase, ClassicCounter<E>> en in terms.EntrySet()) {
      ICounter<E> weights = new ClassicCounter<E>();
      foreach (E k in en.Value.KeySet()) {
        weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
      }
      maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
      wordMaxPat[en.Key] = Counters.Argmax(weights);
    }
    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
    double maxvalue = Counters.Max(maxPatWeightTerms);
    ICollection<CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
    CandidatePhrase bestw = null;
    if (words.Count > 1) {
      // break ties by the raw extraction count of the word with its best pattern
      double max = double.NegativeInfinity;
      foreach (CandidatePhrase w in words) {
        if (terms.GetCount(w, wordMaxPat[w]) > max) {
          max = terms.GetCount(w, wordMaxPat[w]);
          bestw = w;
        }
      }
    } else if (words.Count == 1) {
      IEnumerator<CandidatePhrase> it = words.GetEnumerator();
      it.MoveNext();
      bestw = it.Current;
    } else {
      return new ClassicCounter<CandidatePhrase>();
    }
    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
    return Counters.AsCounter(Arrays.AsList(bestw));
  } else {
    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
  }
}
public virtual void ApplyPats(ICounter<E> patterns, string label, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICollection<CandidatePhrase> alreadyLabeledWords) {
  foreach (KeyValuePair<string, Env> en in constVars.env) {
    en.Value.GetVariables().PutAll(ConstantsAndVariables.globalEnv.GetVariables());
  }
  IDictionary<E, IDictionary<string, DataInstance>> sentencesForPatterns = GetSentences(constVars.invertedIndex.QueryIndex(patterns.KeySet()));
  foreach (KeyValuePair<E, IDictionary<string, DataInstance>> en1 in sentencesForPatterns) {
    RunParallelApplyPats(en1.Value, label, en1.Key, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
  }
  Redwood.Log(Redwood.Dbg, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.Size());
}
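// ApplyPats pipeline, in short: query the inverted index for sentences that could match the learned
// patterns, group those sentences by pattern, and hand each group to RunParallelApplyPats below, which
// fills wordsandLemmaPatExtracted (phrase/lemma counts per pattern) and matchedTokensByPat (match offsets).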
/// <summary>Load a collection of parse trees from the file of given name.</summary>
/// <remarks>
/// Load a collection of parse trees from the file of given name.
/// Each tree may optionally be encased in parens to allow for Penn
/// Treebank style trees.
/// This method implements the <code>FileProcessor</code> interface.
/// </remarks>
/// <param name="file">file to load a tree from</param>
public void ProcessFile(File file) {
  ITreeReader tr = null;
  // SRL stuff
  CollectionValuedMap<int, string> srlMap = null;
  if (this.srlMap != null) {
    // there must be a better way ...
    string filename = file.GetAbsolutePath();
    foreach (string suffix in this.srlMap.Keys) {
      if (filename.EndsWith(suffix)) {
        srlMap = this.srlMap[suffix];
        break;
      }
    }
    if (srlMap == null) {
      log.Info("could not find SRL entries for file: " + file);
    }
  }
  try {
    // maybe print file name to stdout to get some feedback
    // could throw an IO exception if we can't open the file for reading
    tr = TreeReaderFactory().NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), Encoding())));
    int sentIndex = 0;
    Tree pt;
    while ((pt = tr.ReadTree()) != null) {
      if (pt.Label() is IHasIndex) {
        // so we can trace where this tree came from
        IHasIndex hi = (IHasIndex)pt.Label();
        hi.SetDocID(file.GetName());
        hi.SetSentIndex(sentIndex);
      }
      if (srlMap == null) {
        parseTrees.Add(pt);
      } else {
        ICollection<string> srls = srlMap[sentIndex];
        parseTrees.Add(pt);
        if (!srls.IsEmpty()) {
          foreach (string srl in srls) {
            string[] bits = srl.Split("\\s+");
            int verbIndex = System.Convert.ToInt32(bits[0]);
            string lemma = bits[2].Split("\\.")[0];
            Tree verb = Edu.Stanford.Nlp.Trees.Trees.GetTerminal(pt, verbIndex);
            ((CoreLabel)verb.Label()).Set(typeof(CoreAnnotations.CoNLLPredicateAnnotation), true);
            for (int i = 4; i < bits.Length; i++) {
              string arg = bits[i];
              // NOTE: the original branched on whether arg contains "ARGM", but both branches
              // performed the identical split, so the branch is collapsed here
              string[] bits1 = arg.Split("-");
              string locs = bits1[0];
              string argType = bits1[1];
              if (argType.Equals("rel")) {
                continue;
              }
              foreach (string loc in locs.Split("[*,]")) {
                bits1 = loc.Split(":");
                int term = System.Convert.ToInt32(bits1[0]);
                int height = System.Convert.ToInt32(bits1[1]);
                Tree t1 = Edu.Stanford.Nlp.Trees.Trees.GetPreTerminal(pt, term);
                for (int j = 0; j < height; j++) {
                  t1 = t1.Parent(pt);
                }
                IDictionary<int, string> roleMap = ((CoreLabel)t1.Label()).Get(typeof(CoreAnnotations.CoNLLSRLAnnotation));
                if (roleMap == null) {
                  roleMap = Generics.NewHashMap();
                  ((CoreLabel)t1.Label()).Set(typeof(CoreAnnotations.CoNLLSRLAnnotation), roleMap);
                }
                roleMap[verbIndex] = argType;
              }
            }
          }
        }
        // older commented-out code that tagged every node with an SRL_ID (ARG/REL/NO/ALL_NO) has been removed
      }
      sentIndex++;
    }
  } catch (IOException e) {
    throw new RuntimeIOException("MemoryTreebank.processFile IOException in file " + file, e);
  } finally {
    IOUtils.CloseIgnoringExceptions(tr);
  }
}
private void RunParallelApplyPats(IDictionary<string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICollection<CandidatePhrase> alreadyLabeledWords) {
  Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
  IList<string> notAllowedClasses = new List<string>();
  IList<string> sentids = CollectionUtils.ToList(sents.Keys);
  if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass) {
    foreach (string l in constVars.GetAnswerClass().Keys) {
      if (!l.Equals(label)) {
        notAllowedClasses.Add(l);
      }
    }
    notAllowedClasses.Add("OTHERSEM");
  }
  IDictionary<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
  IDictionary<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
  if (constVars.patternType.Equals(PatternFactory.PatternType.Surface)) {
    surfacePatternsLearnedThisIterConverted = new Dictionary<TokenSequencePattern, E>();
    string patternStr = null;
    try {
      patternStr = pattern.ToString(notAllowedClasses);
      TokenSequencePattern pat = (TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr);
      surfacePatternsLearnedThisIterConverted[pat] = pattern;
    } catch (Exception e) {
      log.Info("Error applying pattern " + patternStr + ". Probably an ill-formed pattern (possibly because of special symbols in label names). Contact the software developer.");
      throw;
    }
  } else if (constVars.patternType.Equals(PatternFactory.PatternType.Dep)) {
    depPatternsLearnedThisIterConverted = new Dictionary<SemgrexPattern, E>();
    SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
    depPatternsLearnedThisIterConverted[pat] = pattern;
  } else {
    throw new NotSupportedException();
  }
  // Apply the patterns and extract candidate phrases
  int num;
  int numThreads = constVars.numThreads;
  // If the number of sentences is small, do not create that many threads
  if (sents.Count < 50) {
    numThreads = 1;
  }
  if (numThreads == 1) {
    num = sents.Count;
  } else {
    num = sents.Count / (numThreads - 1);
  }
  IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);
  IList<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>> list = new List<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>>();
  for (int i = 0; i < numThreads; i++) {
    // Each task gets a contiguous slice of the sentence ids
    ICallable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> task = null;
    if (pattern.type.Equals(PatternFactory.PatternType.Surface)) {
      task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
    } else {
      task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
    }
    IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> submit = executor.Submit(task);
    list.Add(submit);
  }
  // Now retrieve the results
  foreach (IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> future in list) {
    try {
      Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> result = future.Get();
      Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
      wordsandLemmaPatExtracted.AddAll(result.First());
      matchedTokensByPat.AddAll(result.Second());
      Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
    } catch (Exception e) {
      executor.ShutdownNow();
      throw new Exception(e.Message, e);
    }
  }
  executor.Shutdown();
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public virtual ICounter<CandidatePhrase> LearnNewPhrases(string label, PatternsForEachToken patternsForEachToken, ICounter<E> patternsLearnedThisIter, ICounter<E> allSelectedPatterns, CollectionValuedMap<E, Triple<string, int, int>> tokensMatchedPatterns, ICounter<CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter<CandidatePhrase, E> terms, TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label, string identifier, ICollection<CandidatePhrase> ignoreWords) {
  bool computeProcDataFreq = false;
  if (Data.processedDataFreq == null) {
    computeProcDataFreq = true;
    Data.processedDataFreq = new ClassicCounter<CandidatePhrase>();
    System.Diagnostics.Debug.Assert(Data.rawFreq != null);
  }
  ICollection<CandidatePhrase> alreadyIdentifiedWords = new HashSet<CandidatePhrase>(constVars.GetLearnedWords(label).KeySet());
  Sharpen.Collections.AddAll(alreadyIdentifiedWords, constVars.GetSeedLabelDictionary()[label]);
  ICounter<CandidatePhrase> words = LearnNewPhrasesPrivate(label, patternsForEachToken, patternsLearnedThisIter, allSelectedPatterns, alreadyIdentifiedWords, tokensMatchedPatterns, scoreForAllWordsThisIteration, terms, wordsPatExtracted, patternsAndWords4Label, identifier, ignoreWords, computeProcDataFreq);
  return words;
}
// Here, the index (startIndex, endIndex) seems to be inclusive of the endIndex
public virtual void PrintSubGraph(SemanticGraph g, IndexedWord w, IList<string> additionalCutOffRels, IList<string> textTokens, ICollection<string> listOfOutput, ICollection<IntPair> listOfOutputIndices, IList<IndexedWord> seenNodes, IList<IndexedWord> doNotAddThese, bool findSubTrees, ICollection<ExtractedPhrase> extractedPhrases, SemgrexPattern pattern, IPredicate<CoreLabel> acceptWord) {
  try {
    if (seenNodes.Contains(w)) {
      return;
    }
    seenNodes.Add(w);
    if (doNotAddThese.Contains(w)) {
      return;
    }
    // First recurse into conjuncts ("conj_and" children), then exclude them from this phrase
    IList<IndexedWord> andNodes = new List<IndexedWord>();
    DescendantsWithReln(g, w, "conj_and", new List<IndexedWord>(), andNodes);
    foreach (IndexedWord w1 in andNodes) {
      PrintSubGraph(g, w1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
    }
    Sharpen.Collections.AddAll(doNotAddThese, andNodes);
    IList<string> allCutOffRels = new List<string>();
    if (additionalCutOffRels != null) {
      Sharpen.Collections.AddAll(allCutOffRels, additionalCutOffRels);
    }
    Sharpen.Collections.AddAll(allCutOffRels, cutoffRelations);
    CollectionValuedMap<int, string> featPerToken = new CollectionValuedMap<int, string>();
    ICollection<string> feat = new List<string>();
    GetPatternsFromDataMultiClass.GetFeatures(g, w, true, feat, null);
    ICollection<IndexedWord> words = Descendants(g, w, allCutOffRels, doNotAddThese, ignoreCommonTags, acceptWord, featPerToken);
    if (words.Count > 0) {
      // The phrase span is min..max token index over the descendant set (token indices are 1-based)
      int min = int.MaxValue;
      int max = -1;
      foreach (IndexedWord word in words) {
        if (word.Index() < min) {
          min = word.Index();
        }
        if (word.Index() > max) {
          max = word.Index();
        }
      }
      if ((max - min + 1) > maxPhraseLength) {
        max = min + maxPhraseLength - 1;
      }
      IntPair indices = new IntPair(min - 1, max - 1);
      string phrase = StringUtils.Join(textTokens.SubList(min - 1, max), " ");
      phrase = phrase.Trim();
      feat.Add("LENGTH-" + (max - min + 1));
      for (int i = min; i <= max; i++) {
        Sharpen.Collections.AddAll(feat, featPerToken[i]);
      }
      ExtractedPhrase extractedPh = new ExtractedPhrase(min - 1, max - 1, pattern, phrase, Counters.AsCounter(feat));
      // NOTE: the original code also tested doNotAddThese.contains(phrase), comparing a string against
      // a list of IndexedWord nodes; that test could never succeed, so it is dropped here
      if (!listOfOutput.Contains(phrase)) {
        listOfOutput.Add(phrase);
        if (!listOfOutputIndices.Contains(indices)) {
          listOfOutputIndices.Add(indices);
          extractedPhrases.Add(extractedPh);
        }
        if (findSubTrees) {
          foreach (IndexedWord word1 in words) {
            if (!seenNodes.Contains(word1)) {
              PrintSubGraph(g, word1, additionalCutOffRels, textTokens, listOfOutput, listOfOutputIndices, seenNodes, doNotAddThese, findSubTrees, extractedPhrases, pattern, acceptWord);
            }
          }
        }
      }
    }
  } catch (Exception e) {
    Sharpen.Runtime.PrintStackTrace(e);
  }
}
/// <exception cref="System.Exception"/>
public static ICollection<IndexedWord> Descendants(SemanticGraph g, IndexedWord vertex, IList<string> allCutOffRels, IList<IndexedWord> doNotAddThese, bool ignoreCommonTags, IPredicate<CoreLabel> acceptWord, CollectionValuedMap<int, string> feat) {
  // Do a depth-first search
  ICollection<IndexedWord> descendantSet = new HashSet<IndexedWord>();
  if (doNotAddThese != null && doNotAddThese.Contains(vertex)) {
    return descendantSet;
  }
  if (!acceptWord.Test(vertex.BackingLabel())) {
    return descendantSet;
  }
  DescendantsHelper(g, vertex, descendantSet, allCutOffRels, doNotAddThese, new List<IndexedWord>(), ignoreCommonTags, acceptWord, feat);
  return descendantSet;
}
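// A usage sketch for Descendants above (hypothetical arguments; acceptAll is an assumed predicate that
// accepts every token): collect the subtree under a node, cutting off at "advcl" relations, while
// per-token features are accumulated into featPerToken:
//   CollectionValuedMap<int, string> featPerToken = new CollectionValuedMap<int, string>();
//   ICollection<IndexedWord> sub = Descendants(g, verbNode, Arrays.AsList("advcl"), null, true, acceptAll, featPerToken);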
/// <exception cref="System.Exception"/>
public virtual Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> Call() {
  try {
    ICollection<CandidatePhrase> alreadyLabeledPhrases = new HashSet<CandidatePhrase>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
    CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
    foreach (string sentid in sentids) {
      IList<CoreLabel> sent = sents[sentid].GetTokens();
      foreach (KeyValuePair<TokenSequencePattern, E> pEn in patterns) {
        if (pEn.Key == null) {
          throw new Exception("why is the pattern " + pEn + " null?");
        }
        TokenSequenceMatcher m = (TokenSequenceMatcher)pEn.Key.GetMatcher(sent);
        // Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
        // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
        // Higher branch values make the matching faster but use more memory
        m.SetBranchLimit(5);
        while (m.Find()) {
          int s = m.Start("$term");
          int e = m.End("$term");
          System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
          string phrase = string.Empty;
          string phraseLemma = string.Empty;
          bool useWordNotLabeled = false;
          bool doNotUse = false;
          // find if the neighboring words are labeled - if so, club them together
          if (constVars.clubNeighboringLabeledWords) {
            for (int i = s - 1; i >= 0; i--) {
              if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) {
                s = i + 1;
                break;
              }
            }
            for (int i1 = e; i1 < sent.Count; i1++) {
              if (!sent[i1].Get(constVars.GetAnswerClass()[label]).Equals(label)) {
                e = i1;
                break;
              }
            }
          }
          // to make sure we discard phrases with stop words in between, but include the ones in which
          // stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
          bool[] addedindices = new bool[e - s];
          for (int i2 = s; i2 < e; i2++) {
            CoreLabel l = sent[i2];
            l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
            if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) {
              l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
            }
            SurfacePattern pSur = (SurfacePattern)pEn.Value;
            System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
            System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
            l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
            foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) {
              if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) {
                doNotUse = true;
              }
            }
            bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
            if (removePhrasesWithStopWords && containsStop) {
              doNotUse = true;
            } else if (!containsStop || !removeStopWordsFromSelectedPhrases) {
              if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) {
                useWordNotLabeled = true;
              }
              phrase += " " + l.Word();
              phraseLemma += " " + l.Lemma();
              addedindices[i2 - s] = true;
            }
          }
          // drop the phrase if a stop word was skipped in the middle of it
          for (int i3 = 0; i3 < addedindices.Length; i3++) {
            if (i3 > 0 && i3 < addedindices.Length - 1 && addedindices[i3 - 1] && !addedindices[i3] && addedindices[i3 + 1]) {
              doNotUse = true;
              break;
            }
          }
          if (!doNotUse) {
            matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
            phrase = phrase.Trim();
            if (!phrase.IsEmpty()) {
              phraseLemma = phraseLemma.Trim();
              CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
              allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
              if (!useWordNotLabeled) {
                alreadyLabeledPhrases.Add(candPhrase);
              }
            }
          }
        }
      }
    }
    return new Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
  } catch (Exception e) {
    logger.Error(e);
    throw;
  }
}
/// <exception cref="System.Exception"/>
private static void DescendantsHelper(SemanticGraph g, IndexedWord curr, ICollection<IndexedWord> descendantSet, IList<string> allCutOffRels, IList<IndexedWord> doNotAddThese, IList<IndexedWord> seenNodes, bool ignoreCommonTags, IPredicate<CoreLabel> acceptWord, CollectionValuedMap<int, string> feat) {
  if (seenNodes.Contains(curr)) {
    return;
  }
  seenNodes.Add(curr);
  if (descendantSet.Contains(curr) || (doNotAddThese != null && doNotAddThese.Contains(curr)) || !acceptWord.Test(curr.BackingLabel())) {
    return;
  }
  if (!ignoreCommonTags || !ignoreTags.Contains(curr.Tag().Trim())) {
    descendantSet.Add(curr);
  }
  foreach (IndexedWord child in g.GetChildren(curr)) {
    bool dontuse = false;
    if (doNotAddThese != null && doNotAddThese.Contains(child)) {
      dontuse = true;
    }
    GrammaticalRelation rel = null;
    if (!dontuse) {
      rel = g.Reln(curr, child);
      dontuse = CheckIfSatisfiesRelConstrains(g, curr, child, allCutOffRels, rel);
    }
    if (!dontuse) {
      foreach (string cutOffTagRegex in cutoffTags) {
        if (child.Tag().Matches(cutOffTagRegex)) {
          if (Debug >= 5) {
            System.Console.Out.WriteLine("ignored tag " + child + " because it satisfied " + cutOffTagRegex);
          }
          dontuse = true;
          break;
        }
      }
    }
    if (!dontuse) {
      if (!feat.Contains(curr.Index())) {
        feat[curr.Index()] = new List<string>();
      }
      GetPatternsFromDataMultiClass.GetFeatures(g, curr, false, feat[curr.Index()], rel);
      DescendantsHelper(g, child, descendantSet, allCutOffRels, doNotAddThese, seenNodes, ignoreCommonTags, acceptWord, feat);
    }
  }
}
/// <exception cref="System.Exception"/>
public virtual Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>> Call() {
  TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
  CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
  foreach (string sentid in sentids) {
    DataInstance sent = sents[sentid];
    IList<CoreLabel> tokens = sent.GetTokens();
    foreach (KeyValuePair<SemgrexPattern, E> pEn in patterns) {
      if (pEn.Key == null) {
        throw new Exception("why is the pattern " + pEn + " null?");
      }
      SemanticGraph graph = ((DataInstanceDep)sent).GetGraph();
      ICollection<ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label);
      foreach (ExtractedPhrase match in matched) {
        int s = match.startIndex;
        int e = match.endIndex + 1;
        string phrase = string.Empty;
        string phraseLemma = string.Empty;
        bool useWordNotLabeled = false;
        bool doNotUse = false;
        // find if the neighboring words are labeled - if so, club them together
        if (constVars.clubNeighboringLabeledWords) {
          for (int i = s - 1; i >= 0; i--) {
            if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label]) {
              s = i;
            } else {
              break;
            }
          }
          for (int i1 = e; i1 < tokens.Count; i1++) {
            if (tokens[i1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label]) {
              e = i1;
            } else {
              break;
            }
          }
        }
        // to make sure we discard phrases with stop words in between, but include the ones in which
        // stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
        bool[] addedindices = new bool[e - s];
        for (int i2 = s; i2 < e; i2++) {
          CoreLabel l = tokens[i2];
          l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
          if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) {
            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
          }
          Pattern pSur = pEn.Value;
          System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
          System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
          l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
          foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) {
            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) {
              doNotUse = true;
            }
          }
          bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
          if (removePhrasesWithStopWords && containsStop) {
            doNotUse = true;
          } else if (!containsStop || !removeStopWordsFromSelectedPhrases) {
            if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) {
              useWordNotLabeled = true;
            }
            phrase += " " + l.Word();
            phraseLemma += " " + l.Lemma();
            addedindices[i2 - s] = true;
          }
        }
        // drop the phrase if a stop word was skipped in the middle of it
        for (int i3 = 0; i3 < addedindices.Length; i3++) {
          if (i3 > 0 && i3 < addedindices.Length - 1 && addedindices[i3 - 1] && !addedindices[i3] && addedindices[i3 + 1]) {
            doNotUse = true;
            break;
          }
        }
        if (!doNotUse && useWordNotLabeled) {
          matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
          phrase = phrase.Trim();
          phraseLemma = phraseLemma.Trim();
          allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0);
        }
      }
    }
  }
  return new Pair<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>>(allFreq, matchedTokensByPat);
}
/// <exception cref="System.Exception"/>
public virtual Pair<TwoDimensionalCounter<Pair<string, string>, E>, CollectionValuedMap<E, Triple<string, int, int>>> Call() {
  CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
  TwoDimensionalCounter<Pair<string, string>, E> allFreq = new TwoDimensionalCounter<Pair<string, string>, E>();
  foreach (string sentid in sentids) {
    IList<CoreLabel> sent = sents[sentid].GetTokens();
    // FIND_ALL is faster than FIND_NONOVERLAP
    IEnumerable<ISequenceMatchResult<ICoreMap>> matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll);
    foreach (ISequenceMatchResult<ICoreMap> m in matched) {
      int s = m.Start("$term");
      int e = m.End("$term");
      E matchedPat = patterns[m.Pattern()];
      matchedTokensByPat.Add(matchedPat, new Triple<string, int, int>(sentid, s, e));
      string phrase = string.Empty;
      string phraseLemma = string.Empty;
      bool useWordNotLabeled = false;
      bool doNotUse = false;
      // find if the neighboring words are labeled - if so, club them together
      if (constVars.clubNeighboringLabeledWords) {
        for (int i = s - 1; i >= 0; i--) {
          if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) {
            s = i + 1;
            break;
          }
        }
        for (int i1 = e; i1 < sent.Count; i1++) {
          if (!sent[i1].Get(constVars.GetAnswerClass()[label]).Equals(label)) {
            e = i1;
            break;
          }
        }
      }
      // to make sure we discard phrases with stop words in between, but include the ones in which
      // stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
      bool[] addedindices = new bool[e - s];
      for (int i2 = s; i2 < e; i2++) {
        CoreLabel l = sent[i2];
        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns))) {
          l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
        }
        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat);
        foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) {
          if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) {
            doNotUse = true;
          }
        }
        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
        if (removePhrasesWithStopWords && containsStop) {
          doNotUse = true;
        } else if (!containsStop || !removeStopWordsFromSelectedPhrases) {
          if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString())) {
            useWordNotLabeled = true;
          }
          phrase += " " + l.Word();
          phraseLemma += " " + l.Lemma();
          addedindices[i2 - s] = true;
        }
      }
      // drop the phrase if a stop word was skipped in the middle of it
      for (int i3 = 0; i3 < addedindices.Length; i3++) {
        if (i3 > 0 && i3 < addedindices.Length - 1 && addedindices[i3 - 1] && !addedindices[i3] && addedindices[i3 + 1]) {
          doNotUse = true;
          break;
        }
      }
      if (!doNotUse && useWordNotLabeled) {
        phrase = phrase.Trim();
        phraseLemma = phraseLemma.Trim();
        allFreq.IncrementCount(new Pair<string, string>(phrase, phraseLemma), matchedPat, 1.0);
      }
    }
  }
  // an older commented-out single-pattern matching loop duplicating the logic above has been removed
  return new Pair<TwoDimensionalCounter<Pair<string, string>, E>, CollectionValuedMap<E, Triple<string, int, int>>>(allFreq, matchedTokensByPat);
}