Example #1
 public static void RemoveKeys <E, F>(TwoDimensionalCounter <E, F> counter, ICollection <E> removeKeysCollection)
 {
     foreach (E key in removeKeysCollection)
     {
         counter.Remove(key);
     }
 }
 private void LoadMWMap(string filename)
 {
     mwCounter = new TwoDimensionalCounter <string, string>();
     try
     {
         BufferedReader br     = new BufferedReader(new InputStreamReader(new FileInputStream(new File(filename)), "UTF-8"));
         int            nLines = 0;
         for (string line; (line = br.ReadLine()) != null; nLines++)
         {
             string[] toks = line.Split("\t");
             System.Diagnostics.Debug.Assert(toks.Length == 3);
             mwCounter.SetCount(toks[0].Trim(), toks[1].Trim(), double.Parse(toks[2].Trim()));
         }
         br.Close();
         System.Console.Error.Printf("%s: Loaded %d lines from %s into MWE counter%n", this.GetType().FullName, nLines, filename);
     }
     catch (UnsupportedEncodingException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (FileNotFoundException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
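For orientation, here is a minimal, hedged usage sketch of the RemoveKeys helper and the counter calls used above (SetCount, Remove, FirstKeySet). The Edu.Stanford.Nlp.Stats using directive and the sample word/tag pairs are assumptions for illustration only.

using System.Collections.Generic;
using Edu.Stanford.Nlp.Stats; // assumed namespace of the Sharpen-ported counter classes

public static class RemoveKeysDemo
{
    // Local copy of the RemoveKeys helper shown above, so this sketch is self-contained.
    public static void RemoveKeys<E, F>(TwoDimensionalCounter<E, F> counter, ICollection<E> removeKeysCollection)
    {
        foreach (E key in removeKeysCollection)
        {
            counter.Remove(key);
        }
    }

    public static void Main()
    {
        TwoDimensionalCounter<string, string> counter = new TwoDimensionalCounter<string, string>();
        counter.SetCount("casa", "nc0s000", 3.0);  // illustrative (word, tag, count) entries
        counter.SetCount("de", "sp000", 10.0);
        counter.SetCount("la", "da0000", 7.0);

        RemoveKeys(counter, new List<string> { "de", "la" });

        // Only "casa" remains among the first keys.
        System.Console.Out.WriteLine(counter.FirstKeySet().Count);  // 1
    }
}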
Example #3
        /// <summary>
        /// Attempt to infer the part of speech of the given preterminal node, which
        /// was created during the expansion of a multi-word token.
        /// </summary>
        private static string InferPOS(Tree t, Tree parent, TwoDimensionalCounter <string, string> unigramTagger)
        {
            string word = t.FirstChild().Value();
            string containingPhraseStr = GetContainingPhrase(t, parent);
            // Overrides: let the manual POS model handle a few special cases first
            string overrideTag = MultiWordPreprocessor.ManualUWModel.GetOverrideTag(word, containingPhraseStr);

            if (overrideTag != null)
            {
                return(overrideTag);
            }
            ICollection <string> unigramTaggerKeys = unigramTagger.FirstKeySet();

            // Try treating this word as a verb and stripping any clitic
            // pronouns. If the stripped version exists in the unigram
            // tagger, then stick with the verb hypothesis
            SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.SeparatePronouns(word);
            if (strippedVerb != null && unigramTaggerKeys.Contains(strippedVerb.GetStem()))
            {
                string pos = Counters.Argmax(unigramTagger.GetCounter(strippedVerb.GetStem()));
                if (pos.StartsWith("v"))
                {
                    return(pos);
                }
            }
            if (unigramTagger.FirstKeySet().Contains(word))
            {
                return(Counters.Argmax(unigramTagger.GetCounter(word), new MultiWordPreprocessor.POSTieBreaker()));
            }
            return(MultiWordPreprocessor.ManualUWModel.GetTag(word, containingPhraseStr));
        }
Example #4
        private double NumNonRedundantPatterns(TwoDimensionalCounter <CandidatePhrase, E> terms, CandidatePhrase w)
        {
            object[] pats   = Sharpen.Collections.ToArray(terms.GetCounter(w).KeySet());
            int      numPat = 0;

            for (int i = 0; i < pats.Length; i++)
            {
                //String pati = constVars.getPatternIndex().get(pats[i]).toString();
                string pati     = pats[i].ToString();
                bool   contains = false;
                for (int j = i + 1; j < pats.Length; j++)
                {
                    //String patj = constVars.getPatternIndex().get(pats[j]).toString();
                    string patj = pats[j].ToString();
                    if (patj.Contains(pati) || pati.Contains(patj))
                    {
                        contains = true;
                        break;
                    }
                }
                if (!contains)
                {
                    numPat++;
                }
            }
            return(numPat);
        }
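As a side note, the redundancy test in NumNonRedundantPatterns is plain substring containment between pattern strings. A standalone sketch of the same check, run over made-up pattern strings:

public static class NonRedundantPatternSketch
{
    public static void Main()
    {
        // Illustrative pattern strings; only the containment logic matters here.
        string[] pats = { "[X] said", "said", "reported that [X]" };
        int numPat = 0;
        for (int i = 0; i < pats.Length; i++)
        {
            bool contains = false;
            for (int j = i + 1; j < pats.Length; j++)
            {
                // A pattern is redundant if it contains, or is contained in, a later pattern.
                if (pats[j].Contains(pats[i]) || pats[i].Contains(pats[j]))
                {
                    contains = true;
                    break;
                }
            }
            if (!contains)
            {
                numPat++;
            }
        }
        // "[X] said" overlaps with the later "said", so only 2 patterns count as non-redundant.
        System.Console.Out.WriteLine(numPat);  // 2
    }
}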
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                log.Info(usage);
                System.Environment.Exit(1);
            }
            Properties options    = StringUtils.ArgsToProperties(args, argOptionDefs);
            string     outputPath = options.GetProperty("o");

            if (outputPath == null)
            {
                throw new ArgumentException("-o argument (output path for built tagger) is required");
            }
            string[]     remainingArgs = options.GetProperty(string.Empty).Split(" ");
            IList <File> fileList      = new List <File>();

            foreach (string arg in remainingArgs)
            {
                fileList.Add(new File(arg));
            }
            Edu.Stanford.Nlp.International.Spanish.Pipeline.AnCoraPOSStats stats = new Edu.Stanford.Nlp.International.Spanish.Pipeline.AnCoraPOSStats(fileList, outputPath);
            stats.Process();
            ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(outputPath));
            TwoDimensionalCounter <string, string> tagger = stats.GetUnigramTagger();

            oos.WriteObject(tagger);
            System.Console.Out.Printf("Wrote tagger to %s%n", outputPath);
        }
Example #6
 public static void TraverseAndFix(Tree t, Tree parent, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER)
 {
     if (t.IsPreTerminal())
     {
         if (t.Value().Equals(SpanishTreeNormalizer.MwTag))
         {
             nMissingPOS++;
             string pos = InferPOS(t, parent, unigramTagger);
             if (pos != null)
             {
                 t.SetValue(pos);
                 nFixedPOS++;
             }
         }
         return;
     }
     foreach (Tree kid in t.Children())
     {
         TraverseAndFix(kid, t, unigramTagger, retainNER);
     }
     // Post-order visit
     if (t.Value().StartsWith(SpanishTreeNormalizer.MwPhraseTag))
     {
         nMissingPhrasal++;
         string phrasalCat = InferPhrasalCategory(t, retainNER);
         if (phrasalCat != null)
         {
             t.SetValue(phrasalCat);
             nFixedPhrasal++;
         }
     }
 }
        /// <summary>Processes a single file containing AnCora XML trees.</summary>
        /// <remarks>
        /// Processes a single file containing AnCora XML trees. Returns MWE statistics for the trees in
        /// the file and the actual parsed trees.
        /// </remarks>
        private static Pair <TwoDimensionalCounter <string, string>, IList <Tree> > ProcessTreeFile(File file, SpanishXMLTreeReaderFactory trf, string encoding)
        {
            TwoDimensionalCounter <string, string> tagger = new TwoDimensionalCounter <string, string>();

            try
            {
                Reader       @in   = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
                ITreeReader  tr    = trf.NewTreeReader(file.GetPath(), @in);
                IList <Tree> trees = new List <Tree>();
                Tree         t;
                Tree         splitPoint;
                while ((t = tr.ReadTree()) != null)
                {
                    do
                    {
                        // We may need to split the current tree into multiple parts.
                        // (If not, a call to `Split` with a `null` split-point is a
                        // no-op.)
                        splitPoint = FindSplitPoint(t);
                        Pair <Tree, Tree> split = Split(t, splitPoint);
                        Tree toAdd = split.First();
                        t = split.Second();
                        trees.Add(toAdd);
                        UpdateTagger(tagger, toAdd);
                    }while (splitPoint != null);
                }
                tr.Close();
                return(new Pair <TwoDimensionalCounter <string, string>, IList <Tree> >(tagger, trees));
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                return(null);
            }
        }
Example #8
 private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string> unigramTagger)
 {
     try
     {
         BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
         ITreeReaderFactory trf = new FrenchTreeReaderFactory();
         ITreeReader        tr  = trf.NewTreeReader(br);
         PrintWriter        pw  = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
         int nTrees             = 0;
         for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
         {
             TraverseAndFix(t, pretermLabel, unigramTagger);
             pw.Println(t.ToString());
         }
         pw.Close();
         tr.Close();
         System.Console.Out.WriteLine("Processed " + nTrees + " trees");
     }
     catch (UnsupportedEncodingException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (FileNotFoundException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Example #9
 public ScorePatternsF1(ConstantsAndVariables constVars, GetPatternsFromDataMultiClass.PatternScoring patternScoring, string label, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label
                        , TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label, TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label, Properties props, ICounter <CandidatePhrase> p0Set, E p0)
     : base(constVars, patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props)
 {
     this.p0    = p0;
     this.p0Set = p0Set;
 }
 public ScorePatternsRatioModifiedFreq(ConstantsAndVariables constVars, GetPatternsFromDataMultiClass.PatternScoring patternScoring, string label, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label
                                       , TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label, TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label, TwoDimensionalCounter <CandidatePhrase, ConstantsAndVariables.ScorePhraseMeasures> phInPatScores
                                       , ScorePhrases scorePhrases, Properties props)
     : base(constVars, patternScoring, label, allCandidatePhrases, patternsandWords4Label, negPatternsandWords4Label, unLabeledPatternsandWords4Label, props)
 {
     this.phInPatScores = phInPatScores;
     this.scorePhrases  = scorePhrases;
 }
 protected virtual void SetUp()
 {
     c = new TwoDimensionalCounter <string, string>();
     c.SetCount("a", "a", 1.0);
     c.SetCount("a", "b", 2.0);
     c.SetCount("a", "c", 3.0);
     c.SetCount("b", "a", 4.0);
     c.SetCount("b", "b", 5.0);
     c.SetCount("c", "a", 6.0);
 }
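A hedged read-back of the SetUp fixture above, using only accessors that appear elsewhere in these examples (GetCount, GetCounter, FirstKeySet, Counters.Argmax). The using directive is an assumption about the Sharpen-ported namespace.

using Edu.Stanford.Nlp.Stats; // assumed namespace for TwoDimensionalCounter, ICounter, Counters

public static class CounterReadBack
{
    public static void Main()
    {
        TwoDimensionalCounter<string, string> c = new TwoDimensionalCounter<string, string>();
        c.SetCount("a", "a", 1.0);
        c.SetCount("a", "b", 2.0);
        c.SetCount("a", "c", 3.0);
        c.SetCount("b", "a", 4.0);
        c.SetCount("b", "b", 5.0);
        c.SetCount("c", "a", 6.0);

        // Single-cell lookup.
        System.Console.Out.WriteLine(c.GetCount("a", "b"));  // 2

        // Row lookup: the inner counter for first key "a" holds {a: 1, b: 2, c: 3}.
        ICounter<string> row = c.GetCounter("a");
        System.Console.Out.WriteLine(Counters.Argmax(row));  // "c"

        // The first key set holds {a, b, c}.
        System.Console.Out.WriteLine(c.FirstKeySet().Count); // 3
    }
}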
Example #12
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
                System.Environment.Exit(-1);
            }
            File treeFile = new File(args[0]);
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new FrenchTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                for (Tree t; (t = tr.ReadTree()) != null;)
                {
                    CountMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
                }
                tr.Close();
                //Closes the underlying reader
                System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
                PrintCounter(labelTerm, "label_term.csv");
                System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
                PrintCounter(termLabel, "term_label.csv");
                System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
                PrintCounter(labelPreterm, "label_pos.csv");
                System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
                PrintCounter(pretermLabel, "pos_label.csv");
                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, pretermLabel, unigramTagger);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
                System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
                System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        private static void UpdateTagger(TwoDimensionalCounter <string, string> tagger, Tree t)
        {
            IList <CoreLabel> yield = t.TaggedLabeledYield();

            foreach (CoreLabel label in yield)
            {
                if (label.Tag().Equals(SpanishTreeNormalizer.MwTag))
                {
                    continue;
                }
                tagger.IncrementCount(label.Word(), label.Tag());
            }
        }
 /// <exception cref="System.IO.IOException"/>
 /// <exception cref="System.TypeLoadException"/>
 public AnCoraProcessor(IList <File> inputFiles, Properties options)
 {
     this.inputFiles = inputFiles;
     this.options    = options;
     if (options.Contains("unigramTagger"))
     {
         ObjectInputStream ois = new ObjectInputStream(new FileInputStream(options.GetProperty("unigramTagger")));
         unigramTagger = (TwoDimensionalCounter <string, string>)ois.ReadObject();
     }
     else
     {
         unigramTagger = new TwoDimensionalCounter <string, string>();
     }
 }
Example #15
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);

            if (!options.Contains(string.Empty) || options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
            bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
            File treeFile  = new File(options.GetProperty(string.Empty));
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                for (Tree t; (t = tr.ReadTree()) != null;)
                {
                    UpdateTagger(unigramTagger, t);
                }
                tr.Close();
                //Closes the underlying reader
                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
                System.Console.Out.WriteLine(string.Format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, (double)nFixedPOS / nMissingPOS * 100));
                System.Console.Out.WriteLine(string.Format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, (double)nFixedPhrasal / nMissingPhrasal * 100));
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Example #16
        public static void UpdateTagger(TwoDimensionalCounter <string, string> tagger, Tree t)
        {
            IList <CoreLabel> yield = t.TaggedLabeledYield();

            foreach (CoreLabel cl in yield)
            {
                if (ResolveDummyTags && cl.Tag().Equals(FrenchXMLTreeReader.MissingPos))
                {
                    continue;
                }
                else
                {
                    tagger.IncrementCount(cl.Word(), cl.Tag());
                }
            }
        }
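The UpdateTagger/TraverseAndFix pair above boils down to counting (word, tag) observations and then taking the most frequent tag per word. A hedged, tree-free sketch of that pattern (the namespace and the Spanish word/tag data are assumptions):

using Edu.Stanford.Nlp.Stats; // assumed namespace for TwoDimensionalCounter and Counters

public static class UnigramTaggerSketch
{
    public static void Main()
    {
        TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();

        // In UpdateTagger the (word, tag) pairs come from t.TaggedLabeledYield();
        // here they are hard-coded purely for illustration.
        unigramTagger.IncrementCount("la", "da0000");
        unigramTagger.IncrementCount("la", "pp3fsa00");
        unigramTagger.IncrementCount("la", "da0000");

        string word = "la";
        if (unigramTagger.FirstKeySet().Contains(word))
        {
            // Most frequently observed tag for the word, as TraverseAndFix does.
            string tag = Counters.Argmax(unigramTagger.GetCounter(word));
            System.Console.Out.WriteLine(tag); // "da0000"
        }
    }
}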
        internal static void PreprocessMWEs(IDictionary <string, Tree> treeMap)
        {
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            foreach (Tree t in treeMap.Values)
            {
                MWEPreprocessor.CountMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            }
            foreach (Tree t_1 in treeMap.Values)
            {
                MWEPreprocessor.TraverseAndFix(t_1, pretermLabel, unigramTagger);
            }
        }
Example #18
        /// <summary>Corrects MWE annotations that lack internal POS labels.</summary>
        private void PreprocessMWEs()
        {
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            foreach (Tree t in treebank)
            {
                MWEPreprocessor.CountMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            }
            foreach (Tree t_1 in treebank)
            {
                MWEPreprocessor.TraverseAndFix(t_1, pretermLabel, unigramTagger);
            }
        }
 public ScorePatterns(ConstantsAndVariables constVars, GetPatternsFromDataMultiClass.PatternScoring patternScoring, string label, ICollection <CandidatePhrase> allCandidatePhrases, TwoDimensionalCounter <E, CandidatePhrase> patternsandWords4Label
                      , TwoDimensionalCounter <E, CandidatePhrase> negPatternsandWords4Label, TwoDimensionalCounter <E, CandidatePhrase> unLabeledPatternsandWords4Label, Properties props)
 {
     // protected TwoDimensionalCounter<SurfacePattern, String>
     // posnegPatternsandWords4Label = new TwoDimensionalCounter<SurfacePattern,
     // String>();
     //protected TwoDimensionalCounter<E, String> negandUnLabeledPatternsandWords4Label = new TwoDimensionalCounter<E, String>();
     //protected TwoDimensionalCounter<E, String> allPatternsandWords4Label = new TwoDimensionalCounter<E, String>();
     this.constVars                       = constVars;
     this.patternScoring                  = patternScoring;
     this.label                           = label;
     this.allCandidatePhrases             = allCandidatePhrases;
     this.patternsandWords4Label          = patternsandWords4Label;
     this.negPatternsandWords4Label       = negPatternsandWords4Label;
     this.unLabeledPatternsandWords4Label = unLabeledPatternsandWords4Label;
     this.props                           = props;
 }
        private static void CountTaggings(Treebank tb, PrintWriter pw)
        {
            TwoDimensionalCounter <string, string> wtc = new TwoDimensionalCounter <string, string>();

            // Note: the tree visitor passed to Apply was lost in the automated conversion;
            // it should increment wtc for each (word, tag) pair in the tree's tagged yield.
            tb.Apply(null);
            foreach (string key in wtc.FirstKeySet())
            {
                pw.Print(key);
                pw.Print('\t');
                ICounter <string> ctr = wtc.GetCounter(key);
                foreach (string k2 in ctr.KeySet())
                {
                    pw.Print(k2 + '\t' + ctr.GetCount(k2) + '\t');
                }
                pw.Println();
            }
        }
Example #21
        public virtual void ApplyPats(ICounter <E> patterns, string label, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICollection <CandidatePhrase> alreadyLabeledWords
                                      )
        {
            //   Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
            //   Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
            //    Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
            foreach (KeyValuePair <string, Env> en in constVars.env)
            {
                en.Value.GetVariables().PutAll(ConstantsAndVariables.globalEnv.GetVariables());
            }
            IDictionary <E, IDictionary <string, DataInstance> > sentencesForPatterns = GetSentences(constVars.invertedIndex.QueryIndex(patterns.KeySet()));

            foreach (KeyValuePair <E, IDictionary <string, DataInstance> > en_1 in sentencesForPatterns)
            {
                RunParallelApplyPats(en_1.Value, label, en_1.Key, wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
            }
            Redwood.Log(Redwood.Dbg, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.Size());
        }
Example #22
        private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
        {
            ITreeFactory          tf       = new LabeledScoredTreeFactory();
            MultiWordTreeExpander expander = new MultiWordTreeExpander();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                PrintWriter        pw  = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
                int nTrees             = 0;
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, null, unigramTagger, retainNER);
                    // Now "decompress" further the expanded trees formed by
                    // multiword token splitting
                    t = expander.ExpandPhrases(t, tn, tf);
                    if (tn != null)
                    {
                        t = tn.NormalizeWholeTree(t, tf);
                    }
                    pw.Println(t.ToString());
                }
                pw.Close();
                tr.Close();
                System.Console.Out.WriteLine("Processed " + nTrees + " trees");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Example #23
 public static void TraverseAndFix(Tree t, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string> unigramTagger)
 {
     if (t.IsPreTerminal())
     {
         if (t.Value().Equals(FrenchXMLTreeReader.MissingPos))
         {
             nMissingPOS++;
             string word = t.FirstChild().Value();
             string tag  = (unigramTagger.FirstKeySet().Contains(word)) ? Counters.Argmax(unigramTagger.GetCounter(word)) : MWEPreprocessor.ManualUWModel.GetTag(word);
             t.SetValue(tag);
         }
         return;
     }
     foreach (Tree kid in t.Children())
     {
         TraverseAndFix(kid, pretermLabel, unigramTagger);
     }
     //Post-order visit
     if (t.Value().Equals(FrenchXMLTreeReader.MissingPhrasal))
     {
         nMissingPhrasal++;
         StringBuilder sb = new StringBuilder();
         foreach (Tree kid_1 in t.Children())
         {
             sb.Append(kid_1.Value()).Append(" ");
         }
         string posSequence = sb.ToString().Trim();
         if (pretermLabel.FirstKeySet().Contains(posSequence))
         {
             string phrasalCat = Counters.Argmax(pretermLabel.GetCounter(posSequence));
             t.SetValue(phrasalCat);
         }
         else
         {
             System.Console.Out.WriteLine("No phrasal cat for: " + posSequence);
         }
     }
 }
Example #24
        public static void CountMWEStatistics(Tree t, TwoDimensionalCounter <string, string> unigramTagger, TwoDimensionalCounter <string, string> labelPreterm, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string>
                                              labelTerm, TwoDimensionalCounter <string, string> termLabel)
        {
            UpdateTagger(unigramTagger, t);
            //Count MWE statistics
            TregexMatcher m = pMWE.Matcher(t);

            while (m.FindNextMatchingNode())
            {
                Tree   match = m.GetMatch();
                string label = match.Value();
                if (ResolveDummyTags && label.Equals(FrenchXMLTreeReader.MissingPhrasal))
                {
                    continue;
                }
                string preterm = SentenceUtils.ListToString(match.PreTerminalYield());
                string term    = SentenceUtils.ListToString(match.Yield());
                labelPreterm.IncrementCount(label, preterm);
                pretermLabel.IncrementCount(preterm, label);
                labelTerm.IncrementCount(label, term);
                termLabel.IncrementCount(term, label);
            }
        }
Example #25
 public static void PrintCounter(TwoDimensionalCounter <string, string> cnt, string fname)
 {
     try
     {
         PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(fname)), false, "UTF-8"));
         foreach (string key in cnt.FirstKeySet())
         {
             foreach (string val in cnt.GetCounter(key).KeySet())
             {
                 pw.Printf("%s\t%s\t%d%n", key, val, (int)cnt.GetCount(key, val));
             }
         }
         pw.Close();
     }
     catch (UnsupportedEncodingException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (FileNotFoundException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Example #26
        private void RunParallelApplyPats(IDictionary <string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICollection
                                          <CandidatePhrase> alreadyLabeledWords)
        {
            Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
            IList <string> notAllowedClasses = new List <string>();
            IList <string> sentids           = CollectionUtils.ToList(sents.Keys);

            if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
            {
                foreach (string l in constVars.GetAnswerClass().Keys)
                {
                    if (!l.Equals(label))
                    {
                        notAllowedClasses.Add(l);
                    }
                }
                notAllowedClasses.Add("OTHERSEM");
            }
            IDictionary <TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
            IDictionary <SemgrexPattern, E>       depPatternsLearnedThisIterConverted     = null;

            if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
            {
                surfacePatternsLearnedThisIterConverted = new Dictionary <TokenSequencePattern, E>();
                string patternStr = null;
                try
                {
                    patternStr = pattern.ToString(notAllowedClasses);
                    TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
                    surfacePatternsLearnedThisIterConverted[pat] = pattern;
                }
                catch (Exception e)
                {
                    log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
                    throw;
                }
            }
            else
            {
                if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
                {
                    depPatternsLearnedThisIterConverted = new Dictionary <SemgrexPattern, E>();
                    SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
                    depPatternsLearnedThisIterConverted[pat] = pattern;
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            //Apply the patterns and extract candidate phrases
            int num;
            int numThreads = constVars.numThreads;

            // If the number of sentences is small, do not create as many threads
            if (sents.Count < 50)
            {
                numThreads = 1;
            }
            if (numThreads == 1)
            {
                num = sents.Count;
            }
            else
            {
                num = sents.Count / (numThreads - 1);
            }
            IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);
            IList <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > > list = new List <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E
                                                                                                                                                                                                                                                                            , Triple <string, int, int> >, ICollection <CandidatePhrase> > > >();

            for (int i = 0; i < numThreads; i++)
            {
                ICallable <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > task = null;
                if (pattern.type.Equals(PatternFactory.PatternType.Surface))
                {
                    //Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
                    task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                             , constVars);
                }
                else
                {
                    task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                                , constVars);
                }
                IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > submit = executor.Submit(task);
                list.Add(submit);
            }
            // Now retrieve the result
            foreach (IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > future in list)
            {
                try
                {
                    Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > result = future.Get();
                    Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
                    wordsandLemmaPatExtracted.AddAll(result.First());
                    matchedTokensByPat.AddAll(result.Second());
                    Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
                }
                catch (Exception e)
                {
                    executor.ShutdownNow();
                    throw new Exception(e.Message, e);
                }
            }
            executor.Shutdown();
        }
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 // Higher branch values make matching faster but use more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                      // Find whether the neighboring words are labeled; if so, club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         logger.Error(e);
         throw;
     }
 }
Example #28
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap
                                                                  <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E
                                                                                                                                                                                                                                                                                                                       , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq)
        {
            ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>();

            if (constVars.doNotApplyPatterns)
            {
                // If we want to get the stats the lossy way, by just counting without
                // applying the patterns
                ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
                while (sentsIter.MoveNext())
                {
                    Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current;
                    this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
                }
            }
            else
            {
                if (patternsLearnedThisIter.Size() > 0)
                {
                    this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);
                }
            }
            if (computeProcDataFreq)
            {
                if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None))
                {
                    Redwood.Log(Redwood.Dbg, "computing processed freq");
                    foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet())
                    {
                        double @in = fq.Value;
                        if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt))
                        {
                            @in = Math.Sqrt(@in);
                        }
                        else
                        {
                            if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log))
                            {
                                @in = 1 + Math.Log(@in);
                            }
                            else
                            {
                                throw new Exception("can't understand the normalization");
                            }
                        }
                        System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in);
                        Data.processedDataFreq.SetCount(fq.Key, @in);
                    }
                }
                else
                {
                    Data.processedDataFreq = Data.rawFreq;
                }
            }
            if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm))
            {
                foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet())
                {
                    if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en))
                    {
                        terms.AddAll(en, wordsPatExtracted.GetCounter(en));
                    }
                }
                RemoveKeys(terms, ConstantsAndVariables.GetStopWords());
                ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false);
                System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S.")));
                ICollection <CandidatePhrase> ignoreWordsAll;
                if (ignoreWords != null && !ignoreWords.IsEmpty())
                {
                    ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords());
                }
                else
                {
                    ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords());
                }
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]);
                Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet());
                System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S.")));
                ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);
                phraseScorer.PrintReasonForChoosing(finalwords);
                scoreForAllWordsThisIteration.Clear();
                Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores);
                Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t"));
                if (constVars.goldEntities != null)
                {
                    IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label];
                    if (goldEntities4Label != null)
                    {
                        StringBuilder s = new StringBuilder();
                        // Note: the lambda passed to ForEach was lost in the automated conversion;
                        // it should append each selected word's gold-entity status to the StringBuilder s.
                        finalwords.KeySet().Stream().ForEach(null);
                        Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString());
                    }
                    else
                    {
                        Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label);
                    }
                }
                if (constVars.outDir != null && !constVars.outDir.IsEmpty())
                {
                    string outputdir = constVars.outDir + "/" + identifier + "/" + label;
                    IOUtils.EnsureDir(new File(outputdir));
                    TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>();
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        foreach (E l in wordsPatExtracted.GetCounter(word).KeySet())
                        {
                            foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l))
                            {
                                reasonForWords.IncrementCount(word, w2);
                            }
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
                    string filename = outputdir + "/words.json";
                    // The JSON object is an array with one entry per iteration: a list of objects,
                    // each of which is a bean of an entity and its reasons
                    IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder();
                    if (writtenInJustification.Contains(label) && writtenInJustification[label])
                    {
                        IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename)));
                        IJsonArray  objarr     = jsonReader.ReadArray();
                        foreach (IJsonValue o in objarr)
                        {
                            obj.Add(o);
                        }
                        jsonReader.Close();
                    }
                    IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder();
                    foreach (CandidatePhrase w in reasonForWords.FirstKeySet())
                    {
                        IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder();
                        IJsonArrayBuilder  l        = Javax.Json.Json.CreateArrayBuilder();
                        foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet())
                        {
                            l.Add(w2.GetPhrase());
                        }
                        IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder();
                        foreach (E p in wordsPatExtracted.GetCounter(w))
                        {
                            pats.Add(p.ToStringSimple());
                        }
                        objinner.Add("reasonwords", l);
                        objinner.Add("patterns", pats);
                        objinner.Add("score", finalwords.GetCount(w));
                        objinner.Add("entity", w.GetPhrase());
                        objThisIter.Add(objinner.Build());
                    }
                    obj.Add(objThisIter);
                    // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
                    // "Writing justification at " + filename);
                    IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII");
                    writtenInJustification[label] = true;
                }
                if (constVars.justify)
                {
                    Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n");
                    foreach (CandidatePhrase word in finalwords.KeySet())
                    {
                        Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n"));
                    }
                }
                // if (usePatternResultAsLabel)
                // if (answerLabel != null)
                // labelWords(sents, commonEngWords, finalwords.keySet(),
                // patterns.keySet(), outFile);
                // else
                // throw new RuntimeException("why is the answer label null?");
                return(finalwords);
            }
            else
            {
                if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb))
                {
                    Counters.AddInPlace(terms, wordsPatExtracted);
                    ICounter <CandidatePhrase>       maxPatWeightTerms = new ClassicCounter <CandidatePhrase>();
                    IDictionary <CandidatePhrase, E> wordMaxPat        = new Dictionary <CandidatePhrase, E>();
                    foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet())
                    {
                        ICounter <E> weights = new ClassicCounter <E>();
                        foreach (E k in en.Value.KeySet())
                        {
                            weights.SetCount(k, patternsLearnedThisIter.GetCount(k));
                        }
                        maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights));
                        wordMaxPat[en.Key] = Counters.Argmax(weights);
                    }
                    Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords);
                    double maxvalue = Counters.Max(maxPatWeightTerms);
                    ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10);
                    CandidatePhrase bestw = null;
                    if (words.Count > 1)
                    {
                        double max = double.NegativeInfinity;
                        foreach (CandidatePhrase w in words)
                        {
                            if (terms.GetCount(w, wordMaxPat[w]) > max)
                            {
                                max   = terms.GetCount(w, wordMaxPat[w]);
                                bestw = w;
                            }
                        }
                    }
                    else
                    {
                        if (words.Count == 1)
                        {
                            // Take the single remaining word (Current is only valid after MoveNext()).
                            IEnumerator<CandidatePhrase> wordsIter = words.GetEnumerator();
                            wordsIter.MoveNext();
                            bestw = wordsIter.Current;
                        }
                        else
                        {
                            return(new ClassicCounter <CandidatePhrase>());
                        }
                    }
                    Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);
                    return(Counters.AsCounter(Arrays.AsList(bestw)));
                }
                else
                {
                    throw new Exception("wordscoring " + constVars.wordScoring + " not identified");
                }
            }
        }
Beispiel #29
0
 /*
  * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq,  TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
  * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
  * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
  * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
  * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
  * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");
  *
  * for(Entry<Integer, Double> en: patterns.entrySet()){
  * Integer pindex = en.getKey();
  * SurfacePattern p = constVars.getPatternIndex().get(pindex);
  * String[] n = p.getSimplerTokensNext();
  * String[] pr = p.getSimplerTokensPrev();
  * boolean rest = false;
  * if(n!=null){
  * for(String e: n){
  * if(!specialWords.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest == false && pr!=null){
  * for(String e: pr){
  * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest)
  * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
  * else
  * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
  * }
  *
  *
  *
  * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());
  *
  * if (constVars.batchProcessSents) {
  * List<File> filesToLoad;
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
  * filesToLoad = Data.sentsFiles;
  * else{
  * filesToLoad = new ArrayList<File>();
  * for (String fname : sentidswithfilerest.keySet()) {
  * String filename;
  * //          if(!constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname;
  * //          else
  * filename = fname;
  * filesToLoad.add(new File(filename));
  * }
  * }
  *
  * for (File fname : filesToLoad) {
  * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname);
  *
  * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){
  *
  * String filename;
  * //          if(constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname.getName();
  * //          else
  * filename = fname.getAbsolutePath();
  *
  * Set<String> sentIDs = sentidswithfilerest.get(filename);
  * if (sentIDs != null){
  * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * Redwood.log(Redwood.DBG, "No sentIds for " + filename  + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  *
  * if (computeDataFreq){
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
  * }
  * }
  *
  * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error.
  * if(computeDataFreq){
  * for(File f: Data.sentsFiles){
  * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(f.getName());
  * }
  * }
  * }
  *
  * } else {
  *
  * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
  * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
  * Set<String> sentids = sentidswithfilerest.get(filename);
  * if (sentids != null) {
  * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * throw new RuntimeException("How come no sentIds for " + filename  + ". Index keyset is " + constVars.invertedIndex.getKeySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
  * }
  * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  * }
  */
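 /// <summary>
 /// Computes pattern-extraction statistics without re-applying patterns to the corpus:
 /// for every token whose pre-computed pattern set (from patternsForEachToken) contains a
 /// pattern learned this iteration, the (word/lemma, pattern) count in
 /// wordsandLemmaPatExtracted is incremented.
 /// </summary>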
 private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted)
 {
     foreach (KeyValuePair <string, DataInstance> sentEn in sents)
     {
         IDictionary <int, ICollection <E> > pat4Sent = patternsForEachToken.GetPatternsForAllTokens(sentEn.Key);
         if (pat4Sent == null)
         {
             throw new Exception("How come there are no patterns for " + sentEn.Key);
         }
         foreach (KeyValuePair <int, ICollection <E> > en in pat4Sent)
         {
             CoreLabel       token = null;
             ICollection <E> p1    = en.Value;
             //        Set<Integer> p1 = en.getValue().first();
             //        Set<Integer> p2 = en.getValue().second();
             //        Set<Integer> p3 = en.getValue().third();
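              // Count the (word/lemma, pattern) pair for every pattern learned this
              // iteration that matched this token.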
             foreach (E index in patternsLearnedThisIter.KeySet())
             {
                 if (p1.Contains(index))
                 {
                     if (token == null)
                     {
                         token = sentEn.Value.GetTokens()[en.Key];
                     }
                     wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(token.Word(), token.Lemma()), index);
                 }
             }
         }
     }
 }
Beispiel #30
0
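        /// <summary>
        /// Selects up to constVars.numWordsToAdd top-scoring candidate phrases from newdt,
        /// iterating in decreasing score order. Iteration stops once a score drops below
        /// thresholdWordExtract; phrases with too few non-redundant supporting patterns
        /// (when listed in useThresholdNumPatternsForTheseWords) or that fuzzily match an
        /// ignore word are skipped. A hypothetical call might look like
        /// ChooseTopWords(phraseScores, phrasePatterns, thresholdedPhrases, stopPhrases, 0.2),
        /// where the argument names are placeholders for counters built elsewhere.
        /// </summary>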
        public virtual ICounter <CandidatePhrase> ChooseTopWords(ICounter <CandidatePhrase> newdt, TwoDimensionalCounter <CandidatePhrase, E> terms, ICounter <CandidatePhrase> useThresholdNumPatternsForTheseWords, ICollection <CandidatePhrase> ignoreWords
                                                                 , double thresholdWordExtract)
        {
            IEnumerator <CandidatePhrase> termIter   = Counters.ToPriorityQueue(newdt).GetEnumerator();
            ICounter <CandidatePhrase>    finalwords = new ClassicCounter <CandidatePhrase>();

            while (termIter.MoveNext())
            {
                if (finalwords.Size() >= constVars.numWordsToAdd)
                {
                    break;
                }
                CandidatePhrase w = termIter.Current;
                if (newdt.GetCount(w) < thresholdWordExtract)
                {
                    Redwood.Log(ConstantsAndVariables.extremedebug, "not adding word " + w + " and any later words because the score " + newdt.GetCount(w) + " is less than the threshold of " + thresholdWordExtract);
                    break;
                }
                System.Diagnostics.Debug.Assert((newdt.GetCount(w) != double.PositiveInfinity));
                if (useThresholdNumPatternsForTheseWords.ContainsKey(w) && NumNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied)
                {
                    Redwood.Log("extremePatDebug", "Not adding " + w + " because the number of non-redundant patterns is below the threshold of " + constVars.thresholdNumPatternsApplied + ": " + terms.GetCounter(w).KeySet());
                    continue;
                }
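                // Optionally reject phrases that fuzzily match an ignore word (e.g. a common
                // English word); rejected phrases are themselves added to the ignore set.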
                CandidatePhrase matchedFuzzy = null;
                if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
                {
                    matchedFuzzy = ConstantsAndVariables.ContainsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
                }
                if (matchedFuzzy == null)
                {
                    Redwood.Log("extremePatDebug", "adding word " + w);
                    finalwords.SetCount(w, newdt.GetCount(w));
                }
                else
                {
                    Redwood.Log("extremePatDebug", "not adding " + w + " because it matched " + matchedFuzzy + " in common English word");
                    ignoreWords.Add(w);
                }
            }
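            // For debugging, log the next ten candidate phrases (and their scores) that
            // follow the cutoff point.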
            string nextTen = string.Empty;
            int    n       = 0;

            while (termIter.MoveNext())
            {
                n++;
                if (n > 10)
                {
                    break;
                }
                CandidatePhrase w = termIter.Current;
                nextTen += ";\t" + w + ":" + newdt.GetCount(w);
            }
            Redwood.Log(Redwood.Dbg, "Next ten phrases were " + nextTen);
            return(finalwords);
        }