/// <summary>Converts a tree to the Morfette training format.</summary>
private static string TreeToMorfette(Tree tree) {
  StringBuilder sb = new StringBuilder();
  IList<ILabel> yield = tree.Yield();
  IList<ILabel> tagYield = tree.PreTerminalYield();
  System.Diagnostics.Debug.Assert(yield.Count == tagYield.Count);
  int listLen = yield.Count;
  for (int i = 0; i < listLen; ++i) {
    CoreLabel token = (CoreLabel)yield[i];
    CoreLabel tag = (CoreLabel)tagYield[i];
    string morphStr = token.OriginalText();
    if (morphStr == null || morphStr.Equals(string.Empty)) {
      morphStr = tag.Value();
    }
    string lemma = token.Lemma();
    if (lemma == null || lemma.Equals(string.Empty)) {
      lemma = token.Value();
    }
    // Emit one "word lemma morph" record per line, using .NET composite-format
    // placeholders (string.Format does not substitute Java-style "%s"/"%n").
    sb.Append(string.Format("{0} {1} {2}{3}", token.Value(), lemma, morphStr, Environment.NewLine));
  }
  return(sb.ToString());
}
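// The sketch below (a hypothetical helper, not part of the original class) shows the
// one-token-per-line "word lemma morph" record TreeToMorfette emits; the token values are
// invented, and OriginalTextAnnotation is assumed to back the OriginalText() getter used above.
private static string MorfetteLineSketch() {
  CoreLabel token = new CoreLabel();
  token.SetValue("chats");
  token.SetLemma("chat");
  token.Set(typeof(CoreAnnotations.OriginalTextAnnotation), "NC-mp"); // morph analysis rides in OriginalText
  string morph = string.IsNullOrEmpty(token.OriginalText()) ? "UNK" : token.OriginalText();
  string lemma = string.IsNullOrEmpty(token.Lemma()) ? token.Value() : token.Lemma();
  return string.Format("{0} {1} {2}", token.Value(), lemma, morph); // "chats chat NC-mp"
}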
private static bool ContainsStopWord(CoreLabel l, ICollection<string> commonEngWords, Pattern ignoreWordRegex) {
  // if (useWordResultCache.containsKey(l.word()))
  //   return useWordResultCache.get(l.word());
  if ((commonEngWords != null && (commonEngWords.Contains(l.Lemma()) || commonEngWords.Contains(l.Word())))
      || (ignoreWordRegex != null && ignoreWordRegex.Matcher(l.Lemma()).Matches())) {
    // || (ignoreWords != null && (ignoreWords.contains(l.lemma()) || ignoreWords.contains(l.word())))
    // useWordResultCache.putIfAbsent(l.word(), false);
    return(true);
  }
  // if (l.word().length() >= minLen4Fuzzy) {
  //   try {
  //     String matchedFuzzy = NoisyLabelSentences.containsFuzzy(commonEngWords, l.word(), minLen4Fuzzy);
  //     if (matchedFuzzy != null) {
  //       synchronized (commonEngWords) {
  //         commonEngWords.add(l.word());
  //         System.out.println("word is " + l.word() + " and matched fuzzy with " + matchedFuzzy);
  //       }
  //       useWordResultCache.putIfAbsent(l.word(), false);
  //       return false;
  //     }
  //   } catch (Exception e) {
  //     e.printStackTrace();
  //     System.out.println("Exception " + " while fuzzy matching " + l.word());
  //   }
  // }
  // useWordResultCache.putIfAbsent(l.word(), true);
  return(false);
}
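// A hypothetical usage sketch for ContainsStopWord: the stop-word set and regex below are
// invented, and Pattern.Compile is assumed to mirror java.util.regex in the Sharpen shim.
private static bool StopWordSketch() {
  CoreLabel tok = new CoreLabel();
  tok.SetWord("the");
  tok.SetLemma("the");
  ICollection<string> commonEngWords = new HashSet<string> { "the", "of", "and" };
  Pattern ignoreWordRegex = Pattern.Compile("[^a-zA-Z]+");
  return ContainsStopWord(tok, commonEngWords, ignoreWordRegex); // true: lemma "the" is a common word
}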
private RVFDatum<string, string> GetDatum(CoreLabel[] sent, int i) {
  ICounter<string> feat = new ClassicCounter<string>();
  CoreLabel l = sent[i];
  string label;
  if (l.Get(answerClass).ToString().Equals(answerLabel)) {
    label = answerLabel;
  } else {
    label = "O";
  }
  CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
  if (matchedPhrases == null) {
    matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
    matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
  }
  foreach (CandidatePhrase w in matchedPhrases.AllValues()) {
    // A value-type int cannot be compared to null, and the raw indexer would throw on a
    // missing key: look the cluster id up explicitly and fall back to -1.
    int num;
    if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num)) {
      num = -1;
    }
    feat.SetCount("Cluster-" + num, 1.0);
  }
  // feat.incrementCount("WORD-" + l.word());
  // feat.incrementCount("LEMMA-" + l.lemma());
  // feat.incrementCount("TAG-" + l.tag());
  int window = 0;
  for (int j = Math.Max(0, i - window); j < i; j++) {
    CoreLabel lj = sent[j];
    feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
    feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
    feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
  }
  for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++) {
    CoreLabel lj = sent[j_1];
    feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
    feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
    feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
  }
  // System.out.println("adding " + l.word() + " as " + label);
  return(new RVFDatum<string, string>(feat, label));
}
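// An isolated sketch of the PREV-/NEXT- window features computed in GetDatum, using plain
// strings instead of CoreLabels; the window size and tokens are invented (GetDatum above
// uses window = 0, i.e., no context features at all).
private static ICounter<string> WindowFeatureSketch() {
  string[] words = { "the", "big", "cat", "sat" };
  int i = 2;      // focus token: "cat"
  int window = 1; // one token of context on each side
  ICounter<string> feat = new ClassicCounter<string>();
  for (int j = Math.Max(0, i - window); j < i; j++) {
    feat.IncrementCount("PREV-WORD-" + words[j]);
  }
  for (int j = i + 1; j < words.Length && j <= i + window; j++) {
    feat.IncrementCount("NEXT-WORD-" + words[j]);
  }
  return feat; // {"PREV-WORD-big": 1.0, "NEXT-WORD-sat": 1.0}
}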
/// <param name="args"/> public static void Main(string[] args) { if (args.Length != 2) { System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName); System.Environment.Exit(-1); } string treeFile = args[0]; string morfetteFile = args[1]; ITreeReaderFactory trf = new FrenchTreeReaderFactory(); try { ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"))); IEnumerator <IList <CoreLabel> > morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile); for (Tree tree; (tree = tr.ReadTree()) != null && morfetteItr.MoveNext();) { IList <CoreLabel> analysis = morfetteItr.Current; IList <ILabel> yield = tree.Yield(); System.Diagnostics.Debug.Assert(analysis.Count == yield.Count); int yieldLen = yield.Count; for (int i = 0; i < yieldLen; ++i) { CoreLabel tokenAnalysis = analysis[i]; ILabel token = yield[i]; string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma()); string newLeaf = string.Format("%s%s%s%s%s", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag()); ((CoreLabel)token).SetValue(newLeaf); } System.Console.Out.WriteLine(tree.ToString()); } if (tr.ReadTree() != null || morfetteItr.MoveNext()) { log.Info("WARNING: Uneven input files!"); } tr.Close(); } catch (UnsupportedEncodingException e) { Sharpen.Runtime.PrintStackTrace(e); } catch (FileNotFoundException e) { Sharpen.Runtime.PrintStackTrace(e); } catch (IOException e) { Sharpen.Runtime.PrintStackTrace(e); } }
public virtual void TestCoreLabelSetWordBehavior() {
  CoreLabel foo = new CoreLabel();
  foo.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
  foo.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
  foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
  // Lemma gets removed with word
  ArrayCoreMap copy = new ArrayCoreMap(foo);
  NUnit.Framework.Assert.AreEqual(copy, foo);
  foo.SetWord("foo");
  NUnit.Framework.Assert.AreEqual(copy, foo); // same word set
  foo.SetWord("bar");
  NUnit.Framework.Assert.IsFalse(copy.Equals(foo)); // lemma removed
  foo.SetWord("foo");
  NUnit.Framework.Assert.IsFalse(copy.Equals(foo)); // still removed
  foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
  NUnit.Framework.Assert.AreEqual(copy, foo); // back to normal
  // Hash code is consistent
  int hashCode = foo.GetHashCode();
  NUnit.Framework.Assert.AreEqual(copy.GetHashCode(), hashCode);
  foo.SetWord("bar");
  NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
  foo.SetWord("foo");
  NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
  // Hash code doesn't care between a value of null and the key not existing
  NUnit.Framework.Assert.IsTrue(foo.Lemma() == null);
  int lemmalessHashCode = foo.GetHashCode();
  foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
  NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
  foo.SetLemma(null);
  NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
  foo.SetLemma("fool");
  NUnit.Framework.Assert.AreEqual(hashCode, foo.GetHashCode());
  // Check equals
  foo.SetWord("bar");
  foo.SetWord("foo");
  ArrayCoreMap nulledCopy = new ArrayCoreMap(foo);
  NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
  foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
  NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
}
/// <exception cref="System.IO.IOException"/> public virtual void HandleLemma(string arg, OutputStream outStream) { if (arg == null) { return; } IList <CoreLabel> tokens = parser.Lemmatize(arg); OutputStreamWriter osw = new OutputStreamWriter(outStream, "utf-8"); for (int i = 0; i < tokens.Count; ++i) { CoreLabel word = tokens[i]; if (i > 0) { osw.Write(" "); } osw.Write(word.Lemma()); } osw.Write("\n"); osw.Flush(); }
/// <exception cref="System.Exception"/> public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call() { // CollectionValuedMap<String, Integer> tokensMatchedPattern = new // CollectionValuedMap<String, Integer>(); try { ICollection <CandidatePhrase> alreadyLabeledPhrases = new HashSet <CandidatePhrase>(); TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); foreach (string sentid in sentids) { IList <CoreLabel> sent = sents[sentid].GetTokens(); foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns) { if (pEn.Key == null) { throw new Exception("why is the pattern " + pEn + " null?"); } TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent)); // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced // m.setFindType(SequenceMatcher.FindType.FIND_ALL); //Higher branch values makes the faster but uses more memory m.SetBranchLimit(5); while (m.Find()) { int s = m.Start("$term"); int e = m.End("$term"); System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label); string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) { s = i + 1; break; } } for (int i_1 = e; i_1 < sent.Count; i_1++) { if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label)) { e = i_1; break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // not needed as initialized false for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = sent[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } SurfacePattern pSur = (SurfacePattern)pEn.Value; System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!"); System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. 
The classes in the key set are " + l.KeySet()); l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur); foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse) { matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1)); phrase = phrase.Trim(); if (!phrase.IsEmpty()) { phraseLemma = phraseLemma.Trim(); CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma); allFreq.IncrementCount(candPhrase, pEn.Value, 1.0); if (!useWordNotLabeled) { alreadyLabeledPhrases.Add(candPhrase); } } } } } } return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases)); } catch (Exception e) { logger.Error(e); throw; } }
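// An isolated sketch of the interior-gap test above: a candidate phrase is discarded when a
// kept token is followed by a dropped one that is again followed by a kept one, i.e., a stop
// word was removed from the middle of the phrase rather than trimmed from an end. The
// example array is invented.
private static bool HasInteriorGapSketch() {
  bool[] addedindices = { true, false, true }; // e.g., "Bank [of] America" with "of" dropped
  for (int i = 1; i < addedindices.Length - 1; i++) {
    if (addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
      return true; // phrase would be marked doNotUse
    }
  }
  return false;
}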
private static bool LemmaExists(CoreLabel l) { return(l.Lemma() != null && !l.Lemma().IsEmpty()); }
/* * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq, TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted, * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{ * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>(); * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>(); * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList(); * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an"); * * for(Entry<Integer, Double> en: patterns.entrySet()){ * Integer pindex = en.getKey(); * SurfacePattern p = constVars.getPatternIndex().get(pindex); * String[] n = p.getSimplerTokensNext(); * String[] pr = p.getSimplerTokensPrev(); * boolean rest = false; * if(n!=null){ * for(String e: n){ * if(!specialWords.contains(e)){ * rest = true; * break; * } * } * } * if(rest == false && pr!=null){ * for(String e: pr){ * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){ * rest = true; * break; * } * } * } * if(rest) * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue()); * else * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue()); * } * * * * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex()); * * if (constVars.batchProcessSents) { * List<File> filesToLoad; * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0) * filesToLoad = Data.sentsFiles; * else{ * filesToLoad = new ArrayList<File>(); * for (String fname : sentidswithfilerest.keySet()) { * String filename; * // if(!constVars.usingDirForSentsInIndex) * // filename = constVars.saveSentencesSerDir+"/"+fname; * // else * filename = fname; * filesToLoad.add(new File(filename)); * } * } * * for (File fname : filesToLoad) { * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname); * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname); * * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){ * * String filename; * // if(constVars.usingDirForSentsInIndex) * // filename = constVars.saveSentencesSerDir+"/"+fname.getName(); * // else * filename = fname.getAbsolutePath(); * * Set<String> sentIDs = sentidswithfilerest.get(filename); * if (sentIDs != null){ * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat); * } else * Redwood.log(Redwood.DBG, "No sentIds for " + filename + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet()); * } * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){ * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat); * } * * if (computeDataFreq){ * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound); * Data.fileNamesUsedToComputeRawFreq.add(fname.getName()); * } * } * * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error. 
* if(computeDataFreq){ * for(File f: Data.sentsFiles){ * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){ * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f); * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound); * Data.fileNamesUsedToComputeRawFreq.add(f.getName()); * } * } * } * * } else { * * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) { * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0); * Set<String> sentids = sentidswithfilerest.get(filename); * if (sentids != null) { * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat); * } else * throw new RuntimeException("How come no sentIds for " + filename + ". Index keyset is " + constVars.invertedIndex.getKeySet()); * } * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){ * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat); * } * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound); * } * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size()); * } */ private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted) { foreach (KeyValuePair <string, DataInstance> sentEn in sents) { IDictionary <int, ICollection <E> > pat4Sent = patternsForEachToken.GetPatternsForAllTokens(sentEn.Key); if (pat4Sent == null) { throw new Exception("How come there are no patterns for " + sentEn.Key); } foreach (KeyValuePair <int, ICollection <E> > en in pat4Sent) { CoreLabel token = null; ICollection <E> p1 = en.Value; // Set<Integer> p1 = en.getValue().first(); // Set<Integer> p2 = en.getValue().second(); // Set<Integer> p3 = en.getValue().third(); foreach (E index in patternsLearnedThisIter.KeySet()) { if (p1.Contains(index)) { if (token == null) { token = sentEn.Value.GetTokens()[en.Key]; } wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(token.Word(), token.Lemma()), index); } } } } }
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves) {
  IList<ILabel> labels = tree.Yield();
  foreach (ILabel label in labels) {
    ++nTokens;
    if (!(label is CoreLabel)) {
      throw new ArgumentException("Only works with CoreLabels trees");
    }
    CoreLabel coreLabel = (CoreLabel)label;
    string lemma = coreLabel.Lemma();
    // PTB escaping since we're going to put this in the leaf
    if (lemma == null) {
      // No lemma, so just add the surface form
      lemma = coreLabel.Word();
    } else if (lemma.Equals("(")) {
      lemma = "-LRB-";
    } else if (lemma.Equals(")")) {
      lemma = "-RRB-";
    }
    if (lemmasAsLeaves) {
      string escapedLemma = lemma;
      coreLabel.SetWord(escapedLemma);
      coreLabel.SetValue(escapedLemma);
      coreLabel.SetLemma(lemma);
    }
    if (addMorphoToLeaves) {
      string morphStr = coreLabel.OriginalText();
      if (morphStr == null || morphStr.Equals(string.Empty)) {
        morphStr = MorphoFeatureSpecification.NoAnalysis;
      } else {
        ++nMorphAnalyses;
      }
      // Normalize punctuation analyses
      if (morphStr.StartsWith("PONCT")) {
        morphStr = "PUNC";
      }
      // Build the composite leaf with .NET placeholders (string.Format does not
      // substitute Java-style "%s").
      string newLeaf = string.Format("{0}{1}{2}{3}{4}", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
      coreLabel.SetValue(newLeaf);
      coreLabel.SetWord(newLeaf);
    }
  }
}
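// A sketch of the composite leaf built above, with invented token values; MorphoMark and
// LemmaMark are the same separator constants used in MungeLeaves.
private static string CompositeLeafSketch() {
  string word = "chats";
  string lemma = "chat";
  string morph = "NC-mp";
  // With MorphoMark "!" and LemmaMark "^" (illustrative values), this yields "chats!chat^NC-mp".
  return string.Format("{0}{1}{2}{3}{4}", word, MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morph);
}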
/// <seealso cref="ConjugateEnglish(string, bool)"/> public string ConjugateEnglish(CoreLabel token) { return(ConjugateEnglish(Optional.OfNullable(token.Lemma()).OrElse(token.Word()), false)); }
//goldList null if not training public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining) { Annotation doc = sd.doc; sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList); IList <ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation)); IList <ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation)); IList <CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation)); IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc); GeneralDataset <string, string> dataset = new RVFDataset <string, string>(); //necessary for 'ScoreBestMention' IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >(); //maps quote to corresponding indices in the dataset IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>(); if (isTraining && goldList.Count != quotes.Count) { throw new Exception("Gold Quote List size doesn't match quote list size!"); } for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++) { int initialSize = dataset.Size(); ICoreMap quote = quotes[quoteIdx]; XMLToAnnotation.GoldQuoteInfo gold = null; if (isTraining) { gold = goldList[quoteIdx]; if (gold.speaker == string.Empty) { continue; } } ICoreMap quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))]; Pair <int, int> quoteRun = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation))); // int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class); int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)); //add mentions before quote up to the previous paragraph int rightValue = quoteRun.first - 1; int leftValue = quoteRun.first - 1; //move left value to be the first token idx of the previous paragraph for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--) { ICoreMap sentence = sentences[sentIdx]; if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx) { continue; } if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1) { //quoteParagraphIdx - 1 for this and prev leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation)); } else { break; } } IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>(); if (leftValue > -1 && rightValue > -1) { mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue))); } //mentions in next paragraph leftValue = quoteRun.second + 1; rightValue = quoteRun.second + 1; for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++) { ICoreMap sentence = sentences[sentIdx_1]; // if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) { // continue; // } if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx) { //quoteParagraphIdx + 1 rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1; } else { break; } } IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>(); if (leftValue < tokens.Count && rightValue < tokens.Count) { mentionsInNextParagraph = 
sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue)); } IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>(); Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph); Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph); // System.out.println(candidateMentions.size()); int rankedDistance = 1; int numBackwards = mentionsInPreviousParagraph.Count; foreach (Sieve.MentionData mention in candidateMentions) { IList <CoreLabel> mentionCandidateTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1); ICoreMap mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()]; // if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) { // continue; // } ICounter <string> features = new ClassicCounter <string>(); bool isLeft = true; int distance = quoteRun.first - mention.end; if (distance < 0) { isLeft = false; distance = mention.begin - quoteRun.second; } if (distance < 0) { continue; } //disregard mention-in-quote cases. features.SetCount("wordDistance", distance); IList <CoreLabel> betweenTokens; if (isLeft) { betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first); } else { betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin); } //Punctuation in between foreach (CoreLabel token in betweenTokens) { if (punctuation.Contains(token.Word())) { features.SetCount("punctuationPresence:" + token.Word(), 1); } } // number of mentions away features.SetCount("rankedDistance", rankedDistance); rankedDistance++; if (rankedDistance == numBackwards) { //reset for the forward rankedDistance = 1; } // int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class); //third distance: # of paragraphs away int mentionParagraphIdx = -1; ICoreMap sentenceInMentionParagraph = null; int quoteParagraphBeginToken = GetParagraphBeginToken(quoteFirstSentence, sentences); int quoteParagraphEndToken = GetParagraphEndToken(quoteFirstSentence, sentences); if (isLeft) { if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) { features.SetCount("leftParagraphDistance", 0); mentionParagraphIdx = quoteParagraphIdx; sentenceInMentionParagraph = quoteFirstSentence; } else { int paragraphDistance = 1; int currParagraphIdx = quoteParagraphIdx - paragraphDistance; ICoreMap currSentence = quoteFirstSentence; int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)); while (currParagraphIdx >= 0) { // Paragraph prevParagraph = paragraphs.get(prevParagraphIndex); //extract begin and end tokens of while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx) { currSentenceIdx--; currSentence = sentences[currSentenceIdx]; } int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences); int prevParagraphEnd = GetParagraphEndToken(currSentence, sentences); if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd) { mentionParagraphIdx = currParagraphIdx; sentenceInMentionParagraph = currSentence; features.SetCount("leftParagraphDistance", paragraphDistance); if (paragraphDistance % 2 == 0) { features.SetCount("leftParagraphDistanceEven", 1); } break; } paragraphDistance++; currParagraphIdx--; } } } else { //right if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken) { features.SetCount("rightParagraphDistance", 0); sentenceInMentionParagraph = 
quoteFirstSentence; mentionParagraphIdx = quoteParagraphIdx; } else { int paragraphDistance = 1; int nextParagraphIndex = quoteParagraphIdx + paragraphDistance; ICoreMap currSentence = quoteFirstSentence; int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)); while (currSentenceIdx < sentences.Count) { while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex) { currSentenceIdx++; currSentence = sentences[currSentenceIdx]; } int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences); int nextParagraphEnd = GetParagraphEndToken(currSentence, sentences); if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd) { sentenceInMentionParagraph = currSentence; features.SetCount("rightParagraphDistance", paragraphDistance); break; } paragraphDistance++; nextParagraphIndex++; } } } //2. mention features if (sentenceInMentionParagraph != null) { int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences); int mentionParagraphEnd = GetParagraphEndToken(sentenceInMentionParagraph, sentences); if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken)) { IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>()); Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd)); features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count); features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1); features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count); //mention ordering in paragraph it is in for (int i = 0; i < namesInMentionParagraph.second.Count; i++) { if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i])) { features.SetCount("orderInParagraph", i); } } //if mention paragraph is all one quote if (quotesInMentionParagraph.Count == 1) { ICoreMap qInMentionParagraph = quotesInMentionParagraph[0]; if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd) { features.SetCount("mentionParagraphIsInConversation", 1); } else { features.SetCount("mentionParagraphIsInConversation", -1); } } foreach (ICoreMap quoteIMP in quotesInMentionParagraph) { if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end))) { features.SetCount("mentionInQuote", 1); } } if (features.GetCount("mentionInQuote") != 1) { features.SetCount("mentionNotInQuote", 1); } } } // nearby word syntax types...make sure to check if there are previous or next words // or there will be an array index crash if (mention.begin > 0) { CoreLabel prevWord = tokens[mention.begin - 1]; features.SetCount("prevWordType:" + prevWord.Tag(), 1); if (punctuationForFeatures.Contains(prevWord.Lemma())) { features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1); } } if (mention.end + 1 < tokens.Count) { CoreLabel nextWord = tokens[mention.end + 1]; features.SetCount("nextWordType:" + nextWord.Tag(), 1); if (punctuationForFeatures.Contains(nextWord.Lemma())) { 
features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1); } } // features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1); //quote paragraph features IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx]; features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count); features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1); features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count); //quote features features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1); for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++) { if (quotesInQuoteParagraph[i_1].Equals(quote)) { features.SetCount("quotePosition", i_1 + 1); } } if (features.GetCount("quotePosition") == 0) { throw new Exception("Check this (equality not working)"); } Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun); foreach (string name in namesData.first) { features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1); } //if quote encompasses entire paragraph if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken) { features.SetCount("isImplicitSpeaker", 1); } else { features.SetCount("isImplicitSpeaker", -1); } //Vocative detection if (mention.type.Equals("name")) { IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))]; Person p = null; if (pList != null) { p = pList[0]; } else { Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end)); if (scanForNamesResultPair.first.Count != 0) { string scanForNamesResultString = scanForNamesResultPair.first[0]; if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString)) { p = sd.characterMap[scanForNamesResultString][0]; } } } if (p != null) { foreach (string name_1 in namesData.first) { if (p.aliases.Contains(name_1)) { features.SetCount("nameInQuote", 1); } } if (quoteParagraphIdx > 0) { // Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1); IList <ICoreMap> quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>()); IList <Pair <int, int> > exclusionList = new List <Pair <int, int> >(); foreach (ICoreMap quoteIPP in quotesInPrevParagraph) { Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation))); exclusionList.Add(quoteRange); foreach (string name_2 in sieve.ScanForNames(quoteRange).first) { if (p.aliases.Contains(name_2)) { features.SetCount("nameInPrevParagraphQuote", 1); } } } int sentenceIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)); ICoreMap sentenceInPrevParagraph = null; for (int i = sentenceIdx - 1; i_1 >= 0; i_1--) { ICoreMap currSentence = sentences[i_1]; if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1) { sentenceInPrevParagraph = currSentence; break; } } int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences); int prevParagraphEnd = GetParagraphEndToken(sentenceInPrevParagraph, sentences); IList <Pair <int, 
int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList); foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns) { foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first) { if (p.aliases.Contains(name_2)) { features.SetCount("nameInPrevParagraphNonQuote", 1); } } } } } } if (isTraining) { if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end))) { RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention"); datum.SetID(int.ToString(dataset.Size())); mapDatumToMention[dataset.Size()] = mention; dataset.Add(datum); } else { RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention"); datum.SetID(int.ToString(dataset.Size())); dataset.Add(datum); mapDatumToMention[dataset.Size()] = mention; } } else { RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none"); datum.SetID(int.ToString(dataset.Size())); mapDatumToMention[dataset.Size()] = mention; dataset.Add(datum); } } mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1); } return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset)); }
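// A sketch of the datum/mention bookkeeping used in Featurize: the mention must be recorded
// under the index the datum will occupy, so the map entry is written with dataset.Size()
// before the Add call. Names are reused from Featurize; the datum content is immaterial here.
private static void RecordDatumSketch(GeneralDataset<string, string> dataset, IDictionary<int, Sieve.MentionData> mapDatumToMention, Sieve.MentionData mention, RVFDatum<string, string> datum) {
  datum.SetID(dataset.Size().ToString());
  mapDatumToMention[dataset.Size()] = mention; // index of the datum about to be added
  dataset.Add(datum);
}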
/// <exception cref="System.Exception"/> public virtual Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call() { // CollectionValuedMap<String, Integer> tokensMatchedPattern = new // CollectionValuedMap<String, Integer>(); TwoDimensionalCounter <CandidatePhrase, E> allFreq = new TwoDimensionalCounter <CandidatePhrase, E>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); foreach (string sentid in sentids) { DataInstance sent = sents[sentid]; IList <CoreLabel> tokens = sent.GetTokens(); foreach (KeyValuePair <SemgrexPattern, E> pEn in patterns) { if (pEn.Key == null) { throw new Exception("why is the pattern " + pEn + " null?"); } SemanticGraph graph = ((DataInstanceDep)sent).GetGraph(); //SemgrexMatcher m = pEn.getKey().matcher(graph); //TokenSequenceMatcher m = pEn.getKey().matcher(sent); // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced // m.setFindType(SequenceMatcher.FindType.FIND_ALL); //Higher branch values makes the faster but uses more memory //m.setBranchLimit(5); ICollection <ExtractedPhrase> matched = GetMatchedTokensIndex(graph, pEn.Key, sent, label); foreach (ExtractedPhrase match in matched) { int s = match.startIndex; int e = match.endIndex + 1; string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (tokens[i].Get(constVars.GetAnswerClass()[label]).Equals(label) && (e - i + 1) <= PatternFactory.numWordsCompoundMapped[label]) { s = i; } else { //System.out.println("for phrase " + match + " clubbing earlier word. new s is " + s); break; } } for (int i_1 = e; i_1 < tokens.Count; i_1++) { if (tokens[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label) && (i_1 - s + 1) <= PatternFactory.numWordsCompoundMapped[label]) { e = i_1; } else { //System.out.println("for phrase " + match + " clubbing next word. new e is " + e); break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // get for free on array initialization for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = tokens[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } Pattern pSur = pEn.Value; System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!"); System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. 
The classes in the key set are " + l.KeySet()); l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur); foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label)) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse && useWordNotLabeled) { matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1)); if (useWordNotLabeled) { phrase = phrase.Trim(); phraseLemma = phraseLemma.Trim(); allFreq.IncrementCount(CandidatePhrase.CreateOrGet(phrase, phraseLemma, match.GetFeatures()), pEn.Value, 1.0); } } } } } return(new Pair <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat)); }
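// An isolated sketch of the "club neighboring labeled words" expansion above: the matched
// span grows over adjacent tokens carrying the same label, capped at a maximum phrase
// length. The label array and cap are invented for illustration.
private static Pair<int, int> ClubNeighborsSketch() {
  string[] labels = { "O", "PER", "PER", "PER", "O" };
  int s = 2, e = 3, maxLen = 4; // initial match [s, e) covers only index 2
  for (int i = s - 1; i >= 0; i--) {
    if (labels[i].Equals("PER") && (e - i) <= maxLen) { s = i; } else { break; }
  }
  for (int i = e; i < labels.Length; i++) {
    if (labels[i].Equals("PER") && (i - s + 1) <= maxLen) { e = i + 1; } else { break; }
  }
  return new Pair<int, int>(s, e); // (1, 4): indices 1..3 are all PER
}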
/// <exception cref="System.Exception"/> public virtual Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > > Call() { //CollectionValuedMap<String, Integer> tokensMatchedPattern = new CollectionValuedMap<String, Integer>(); CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >(); TwoDimensionalCounter <Pair <string, string>, E> allFreq = new TwoDimensionalCounter <Pair <string, string>, E>(); foreach (string sentid in sentids) { IList <CoreLabel> sent = sents[sentid].GetTokens(); //FIND_ALL is faster than FIND_NONOVERLAP IEnumerable <ISequenceMatchResult <ICoreMap> > matched = multiPatternMatcher.Find(sent, SequenceMatcher.FindType.FindAll); foreach (ISequenceMatchResult <ICoreMap> m in matched) { int s = m.Start("$term"); int e = m.End("$term"); E matchedPat = patterns[m.Pattern()]; matchedTokensByPat.Add(matchedPat, new Triple <string, int, int>(sentid, s, e)); string phrase = string.Empty; string phraseLemma = string.Empty; bool useWordNotLabeled = false; bool doNotUse = false; //find if the neighboring words are labeled - if so - club them together if (constVars.clubNeighboringLabeledWords) { for (int i = s - 1; i >= 0; i--) { if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label)) { s = i + 1; break; } } for (int i_1 = e; i_1 < sent.Count; i_1++) { if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label)) { e = i_1; break; } } } //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true bool[] addedindices = new bool[e - s]; // Arrays.fill(addedindices, false); // unneeded as done on initialization for (int i_2 = s; i_2 < e; i_2++) { CoreLabel l = sent[i_2]; l.Set(typeof(PatternsAnnotations.MatchedPattern), true); if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns))) { l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>()); } l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(matchedPat); // if (restrictToMatched) { // tokensMatchedPattern.add(sentid, i); // } foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label]) { if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value)) { doNotUse = true; } } bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex); if (removePhrasesWithStopWords && containsStop) { doNotUse = true; } else { if (!containsStop || !removeStopWordsFromSelectedPhrases) { if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label.ToString())) { useWordNotLabeled = true; } phrase += " " + l.Word(); phraseLemma += " " + l.Lemma(); addedindices[i_2 - s] = true; } } } for (int i_3 = 0; i_3 < addedindices.Length; i_3++) { if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true) { doNotUse = true; break; } } if (!doNotUse && useWordNotLabeled) { phrase = phrase.Trim(); phraseLemma = phraseLemma.Trim(); allFreq.IncrementCount(new Pair <string, string>(phrase, phraseLemma), matchedPat, 1.0); } } } // for (SurfacePattern pat : patterns.keySet()) { // String patternStr = pat.toString(); // // TokenSequencePattern p = TokenSequencePattern.compile(constVars.env.get(label), patternStr); // if (pat == null || p == null) // throw new 
RuntimeException("why is the pattern " + pat + " null?"); // // TokenSequenceMatcher m = p.getMatcher(sent); // while (m.find()) { // // int s = m.start("$term"); // int e = m.end("$term"); // // String phrase = ""; // String phraseLemma = ""; // boolean useWordNotLabeled = false; // boolean doNotUse = false; // for (int i = s; i < e; i++) { // CoreLabel l = sent.get(i); // l.set(PatternsAnnotations.MatchedPattern.class, true); // if (restrictToMatched) { // tokensMatchedPattern.add(sentid, i); // } // for (Entry<Class, Object> ig : constVars.ignoreWordswithClassesDuringSelection.get(label).entrySet()) { // if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) { // doNotUse = true; // } // } // boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), constVars.ignoreWordRegex, ignoreWords); // if (removePhrasesWithStopWords && containsStop) { // doNotUse = true; // } else { // if (!containsStop || !removeStopWordsFromSelectedPhrases) { // // if (label == null || l.get(constVars.answerClass.get(label)) == null || !l.get(constVars.answerClass.get(label)).equals(label.toString())) { // useWordNotLabeled = true; // } // phrase += " " + l.word(); // phraseLemma += " " + l.lemma(); // // } // } // } // if (!doNotUse && useWordNotLabeled) { // phrase = phrase.trim(); // phraseLemma = phraseLemma.trim(); // allFreq.incrementCount(new Pair<String, String>(phrase, phraseLemma), pat, 1.0); // } // } // } return(new Pair <TwoDimensionalCounter <Pair <string, string>, E>, CollectionValuedMap <E, Triple <string, int, int> > >(allFreq, matchedTokensByPat)); }
public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords) { ICollection <SurfacePattern> prevpatterns = new HashSet <SurfacePattern>(); ICollection <SurfacePattern> nextpatterns = new HashSet <SurfacePattern>(); ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>(); CoreLabel token = sent[i]; string tag = null; if (usePOS4Pattern) { string fulltag = token.Tag(); if (useCoarsePOS) { tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2)); } else { tag = fulltag; } } string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)); for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++) { IList <Token> previousTokens = new List <Token>(); IList <string> originalPrev = new List <string>(); IList <string> originalNext = new List <string>(); IList <Token> nextTokens = new List <Token>(); int numStopWordsprev = 0; int numStopWordsnext = 0; // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0; int numNonStopWordsNext = 0; int numNonStopWordsPrev = 0; bool useprev = false; bool usenext = false; PatternToken twithoutPOS = null; //TODO: right now using numWordsCompoundMax. if (addPatWithoutPOS) { twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation))); } PatternToken twithPOS = null; if (usePOS4Pattern) { twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation))); } if (usePreviousContext) { // int j = Math.max(0, i - 1); int j = i - 1; int numTokens = 0; while (numTokens < maxWin && j >= 0) { // for (int j = Math.max(i - maxWin, 0); j < i; j++) { CoreLabel tokenj = sent[j]; string tokenjStr; if (useLemmaContextTokens) { tokenjStr = tokenj.Lemma(); } else { tokenjStr = tokenj.Word(); } // do not use this word in context consideration if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower())) { j--; continue; } // if (!tokenj.containsKey(answerClass.get(label))) { // throw new RuntimeException("how come the class " // + answerClass.get(label) + " for token " // + tokenj.word() + " in " + sent + " is not set"); // } Triple <bool, Token, string> tr = GetContextTokenStr(tokenj); bool isLabeledO = tr.first; Token strgeneric = tr.second; string strOriginal = tr.third; if (!isLabeledO) { // numPrevTokensSpecial++; previousTokens.Add(0, strgeneric); // previousTokens.add(0, // "[{answer:" // + tokenj.get(answerClass.get(label)).toString() // + "}]"); originalPrev.Add(0, strOriginal); numNonStopWordsPrev++; } else { if (tokenj.Word().StartsWith("http")) { useprev = false; previousTokens.Clear(); originalPrev.Clear(); break; } else { Token str = SurfacePattern.GetContextToken(tokenj); previousTokens.Add(0, str); originalPrev.Add(0, tokenjStr); if (DoNotUse(tokenjStr, stopWords)) { numStopWordsprev++; } else { numNonStopWordsPrev++; } } } numTokens++; j--; } } if (useNextContext) { int numTokens = 0; int j = i + 1; while (numTokens < maxWin && j < sent.Count) { // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) { CoreLabel tokenj = sent[j]; string tokenjStr; if (useLemmaContextTokens) { tokenjStr = tokenj.Lemma(); } else { tokenjStr = tokenj.Word(); } // do not use this word in context consideration if (useFillerWordsInPat && 
fillerWords.Contains(tokenj.Word().ToLower())) { j++; continue; } // if (!tokenj.containsKey(answerClass.get(label))) { // throw new RuntimeException( // "how come the dict annotation for token " + tokenj.word() // + " in " + sent + " is not set"); // } Triple <bool, Token, string> tr = GetContextTokenStr(tokenj); bool isLabeledO = tr.first; Token strgeneric = tr.second; string strOriginal = tr.third; // boolean isLabeledO = tokenj.get(answerClass.get(label)) // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL); if (!isLabeledO) { // numNextTokensSpecial++; numNonStopWordsNext++; nextTokens.Add(strgeneric); // nextTokens.add("[{" + label + ":" // + tokenj.get(answerClass.get(label)).toString() // + "}]"); originalNext.Add(strOriginal); } else { // originalNextStr += " " // + tokenj.get(answerClass.get(label)).toString(); if (tokenj.Word().StartsWith("http")) { usenext = false; nextTokens.Clear(); originalNext.Clear(); break; } else { // if (!tokenj.word().matches("[.,?()]")) { Token str = SurfacePattern.GetContextToken(tokenj); nextTokens.Add(str); originalNext.Add(tokenjStr); if (DoNotUse(tokenjStr, stopWords)) { numStopWordsnext++; } else { numNonStopWordsNext++; } } } j++; numTokens++; } } // String prevContext = null, nextContext = null; // int numNonSpecialPrevTokens = previousTokens.size() // - numPrevTokensSpecial; // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial; Token[] prevContext = null; //String[] prevContext = null; //String[] prevOriginalArr = null; // if (previousTokens.size() >= minWindow4Pattern // && (numStopWordsprev < numNonSpecialPrevTokens || // numNonSpecialPrevTokens > numMinStopWordsToAdd)) { if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd)) { // prevContext = StringUtils.join(previousTokens, fw); IList <Token> prevContextList = new List <Token>(); IList <string> prevOriginal = new List <string>(); foreach (Token p in previousTokens) { prevContextList.Add(p); if (!fw.IsEmpty()) { prevContextList.Add(fw); } } // add fw and sw to the the originalprev foreach (string p_1 in originalPrev) { prevOriginal.Add(p_1); if (!fw.IsEmpty()) { prevOriginal.Add(" FW "); } } if (!sw.IsEmpty()) { prevContextList.Add(sw); prevOriginal.Add(" SW "); } // String str = prevContext + fw + sw; if (IsASCII(StringUtils.Join(prevOriginal))) { prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]); //prevOriginalArr = prevOriginal.toArray(new String[0]); if (previousTokens.Count >= minWindow4Pattern) { if (twithoutPOS != null) { SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev); prevpatterns.Add(pat); } if (twithPOS != null) { SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev); prevpatterns.Add(patPOS); } } useprev = true; } } Token[] nextContext = null; //String [] nextOriginalArr = null; // if (nextTokens.size() > 0 // && (numStopWordsnext < numNonSpecialNextTokens || // numNonSpecialNextTokens > numMinStopWordsToAdd)) { if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd)) { // nextContext = StringUtils.join(nextTokens, fw); IList <Token> nextContextList = new List <Token>(); IList <string> nextOriginal = new List <string>(); if (!sw.IsEmpty()) { nextContextList.Add(sw); nextOriginal.Add(" SW "); } foreach (Token n in nextTokens) { if (!fw.IsEmpty()) { nextContextList.Add(fw); } nextContextList.Add(n); } foreach (string n_1 
in originalNext) { if (!fw.IsEmpty()) { nextOriginal.Add(" FW "); } nextOriginal.Add(n_1); } if (nextTokens.Count >= minWindow4Pattern) { nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]); //nextOriginalArr = nextOriginal.toArray(new String[0]); if (twithoutPOS != null) { SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next); nextpatterns.Add(pat); } if (twithPOS != null) { SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next); nextpatterns.Add(patPOS); } } usenext = true; } if (useprev && usenext) { // String strprev = prevContext + fw + sw; // String strnext = sw + fw + nextContext; if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern) { if (twithoutPOS != null) { SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext); prevnextpatterns.Add(pat); } if (twithPOS != null) { SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext); prevnextpatterns.Add(patPOS); } } } } // Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>( // prevpatterns, nextpatterns, prevnextpatterns); // System.out.println("For word " + sent.get(i) + " in sentence " + sent + // " prev patterns are " + prevpatterns); // System.out.println("For word " + sent.get(i) + " in sentence " + sent + // " next patterns are " + nextpatterns); // System.out.println("For word " + sent.get(i) + " in sentence " + sent + // " prevnext patterns are " + prevnextpatterns); //getPatternIndex().finishCommit(); return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns)); }
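// An isolated sketch of the windowed context walk used in GetContext: collect up to maxWin
// tokens to the left of position i, skipping filler words, which do not count toward the
// window. The sentence and filler set are invented for illustration.
private static IList<string> PreviousContextSketch() {
  string[] sent = { "one", "of", "the", "finest", "players" };
  ICollection<string> fillers = new HashSet<string> { "of", "the" };
  int i = 4, maxWin = 2, numTokens = 0;
  IList<string> previous = new List<string>();
  for (int j = i - 1; j >= 0 && numTokens < maxWin; j--) {
    if (fillers.Contains(sent[j])) {
      continue; // filler word: skip without consuming window budget
    }
    previous.Insert(0, sent[j]); // keep left-to-right order
    numTokens++;
  }
  return previous; // ["one", "finest"]
}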