Esempio n. 1
0
 /// <summary>
 /// Reads every tree from <paramref name="treeFile"/>, passes it through
 /// <c>TraverseAndFix</c>, and writes the result, one tree per line, to a file whose
 /// name is the original path with ".fixed" appended. I/O failures are reported by
 /// printing the stack trace; they are not rethrown.
 /// </summary>
 /// <param name="treeFile">Tree file to read; it is decoded as UTF-8.</param>
 /// <param name="pretermLabel">Counter handed unchanged to <c>TraverseAndFix</c>.</param>
 /// <param name="unigramTagger">Counter handed unchanged to <c>TraverseAndFix</c>.</param>
 private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string> unigramTagger)
 {
     try
     {
         BufferedReader br     = null;
         ITreeReader    tr     = null;
         PrintWriter    pw     = null;
         int            nTrees = 0;
         try
         {
             br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
             ITreeReaderFactory trf = new FrenchTreeReaderFactory();
             tr = trf.NewTreeReader(br);
             pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
             for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
             {
                 TraverseAndFix(t, pretermLabel, unigramTagger);
                 pw.Println(t.ToString());
             }
         }
         finally
         {
             // Release both files even when reading, fixing or writing fails part-way.
             // Same close order as the success path always used: writer first, then reader.
             if (pw != null)
             {
                 pw.Close();
             }
             if (tr != null)
             {
                 // The tree reader is expected to close the reader it wraps.
                 tr.Close();
             }
             else if (br != null)
             {
                 // The tree reader was never built, so close the raw reader directly.
                 br.Close();
             }
         }
         // Only reached when every tree was processed and both files were closed.
         System.Console.Out.WriteLine("Processed " + nTrees + " trees");
     }
     catch (UnsupportedEncodingException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (FileNotFoundException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Esempio n. 2
0
        //Delete sentence-initial punctuation
        //Delete sentence final punctuation that is preceded by punctuation (first time)
        //Delete sentence final punctuation that is preceded by punctuation (second time)
        //Convert remaining sentence-final punctuation to either . if it is not [.!?]
        //Delete medial, sentence-final punctuation
        //Now move the sentence-final mark under SENT
        //For those trees that lack a sentence-final punc, add one.
        //Finally, delete these punctuation marks, which I can't seem to kill otherwise...
        //A bad MWADV tree in the training set
        // Not sure why this got a label of X.  Similar trees suggest it
        // should be A instead
        // This also seems to be mislabeled
        /// <summary>
        /// Command-line entry point. Reads the tree file named by the single argument,
        /// discards trees that match either bad-tree pattern, runs every other tree through
        /// <c>FTBCorrector.TransformTree</c>, and prints each result to standard output.
        /// The number of trees actually printed is reported on standard error.
        /// </summary>
        /// <param name="args">Exactly one element: the path of the tree file (decoded as UTF-8).</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
                System.Environment.Exit(-1);
            }
            ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
            File             f  = new File(args[0]);

            try
            {
                //These bad trees in the Candito training set should be thrown out:
                //  (ROOT (SENT (" ") (. .)))
                //  (ROOT (SENT (. .)))
                TregexPattern pBadTree  = TregexPattern.Compile("@SENT <: @PUNC");
                TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");

                BufferedReader br       = null;
                ITreeReader    tr       = null;
                int            nWritten = 0;
                try
                {
                    br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
                    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
                    tr = trf.NewTreeReader(br);
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        TregexMatcher m  = pBadTree.Matcher(t);
                        TregexMatcher m2 = pBadTree2.Matcher(t);
                        if (m.Find() || m2.Find())
                        {
                            log.Info("Discarding tree: " + t.ToString());
                        }
                        else
                        {
                            Tree fixedT = tt.TransformTree(t);
                            System.Console.Out.WriteLine(fixedT.ToString());
                            // Count only trees that were really emitted, so the summary
                            // below does not include the discarded ones.
                            nWritten++;
                        }
                    }
                }
                finally
                {
                    // Release the input file even if reading or transforming fails.
                    if (tr != null)
                    {
                        // The tree reader is expected to close the reader it wraps.
                        tr.Close();
                    }
                    else if (br != null)
                    {
                        br.Close();
                    }
                }
                System.Console.Error.Printf("Wrote %d trees%n", nWritten);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (TregexParseException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Command-line entry point. Reads every tree from the file named by the single
        /// argument, accumulates MWE statistics with <c>CountMWEStatistics</c>, writes the
        /// four counters to CSV files via <c>PrintCounter</c>, then calls
        /// <c>ResolveDummyTags</c> on the same file and prints summary counts.
        /// </summary>
        /// <param name="args">Exactly one element: the path of the tree file (decoded as UTF-8).</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
                System.Environment.Exit(-1);
            }
            File treeFile = new File(args[0]);
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                BufferedReader br = null;
                ITreeReader    tr = null;
                try
                {
                    br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
                    tr = trf.NewTreeReader(br);
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        CountMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
                    }
                }
                finally
                {
                    // Release the input file even if reading or counting fails; it must
                    // also be closed before ResolveDummyTags opens the same file again.
                    if (tr != null)
                    {
                        //Closes the underlying reader
                        tr.Close();
                    }
                    else if (br != null)
                    {
                        br.Close();
                    }
                }
                System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
                PrintCounter(labelTerm, "label_term.csv");
                System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
                PrintCounter(termLabel, "term_label.csv");
                System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
                PrintCounter(labelPreterm, "label_pos.csv");
                System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
                PrintCounter(pretermLabel, "pos_label.csv");
                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, pretermLabel, unigramTagger);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
                System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
                System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Command-line entry point. Walks a tree file and a Morfette analysis file in
        /// lock-step; for each tree, every leaf value is replaced by the concatenation of
        /// the original token, <c>MorphoFeatureSpecification.MorphoMark</c>, the lemma
        /// returned by <c>GetLemma</c>, <c>MorphoFeatureSpecification.LemmaMark</c> and the
        /// analysis tag. Each rewritten tree is printed to standard output. A warning is
        /// logged if one input runs out before the other.
        /// </summary>
        /// <param name="args">
        /// Exactly two elements: the tree file path (decoded as UTF-8) and the Morfette file path.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile     = args[0];
            string             morfetteFile = args[1];
            ITreeReaderFactory trf          = new FrenchTreeReaderFactory();

            try
            {
                BufferedReader br = null;
                ITreeReader    tr = null;
                try
                {
                    br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                    tr = trf.NewTreeReader(br);
                    IEnumerator <IList <CoreLabel> > morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);

                    bool unevenInput = false;
                    while (true)
                    {
                        Tree tree = tr.ReadTree();
                        if (tree == null)
                        {
                            // Trees are exhausted: a leftover analysis means the files disagree.
                            unevenInput = morfetteItr.MoveNext();
                            break;
                        }
                        if (!morfetteItr.MoveNext())
                        {
                            // A tree was read but no analysis is left for it. Record this
                            // here: the tree has already been consumed, so re-reading after
                            // the loop would miss it when it was the last tree in the file.
                            unevenInput = true;
                            break;
                        }

                        IList <CoreLabel> analysis = morfetteItr.Current;
                        IList <ILabel>    leaves   = tree.Yield();
                        // Checked in debug builds only.
                        System.Diagnostics.Debug.Assert(analysis.Count == leaves.Count);
                        int leafCount = leaves.Count;
                        for (int i = 0; i < leafCount; ++i)
                        {
                            CoreLabel tokenAnalysis = analysis[i];
                            ILabel    token         = leaves[i];
                            string    lemma         = GetLemma(token.Value(), tokenAnalysis.Lemma());
                            // Plain concatenation: .NET string.Format does not understand
                            // Java-style "%s" placeholders and would return the format
                            // string itself instead of the five joined values.
                            string newLeaf = token.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + tokenAnalysis.Tag();
                            ((CoreLabel)token).SetValue(newLeaf);
                        }
                        System.Console.Out.WriteLine(tree.ToString());
                    }
                    if (unevenInput)
                    {
                        log.Info("WARNING: Uneven input files!");
                    }
                }
                finally
                {
                    // Release the tree file even if processing fails part-way.
                    if (tr != null)
                    {
                        // The tree reader is expected to close the reader it wraps.
                        tr.Close();
                    }
                    else if (br != null)
                    {
                        br.Close();
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Command-line entry point. Reads every tree from the file named by the single
        /// argument and prints one "word lemma morph" line per leaf, followed by a blank
        /// line after each tree. The lemma and morph come from
        /// <c>MorphoFeatureSpecification.SplitMorphString</c>; when the morph part is null,
        /// empty or "XXX", the value of the leaf's pre-terminal is printed instead.
        /// </summary>
        /// <param name="args">Exactly one element: the path of the tree file (decoded as UTF-8).</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile = args[0];
            ITreeReaderFactory trf      = new FrenchTreeReaderFactory();

            try
            {
                BufferedReader br = null;
                ITreeReader    tr = null;
                try
                {
                    br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                    tr = trf.NewTreeReader(br);
                    for (Tree tree1; (tree1 = tr.ReadTree()) != null;)
                    {
                        IList <ILabel> pretermYield = tree1.PreTerminalYield();
                        IList <ILabel> leaves       = tree1.Yield();
                        int            leafCount    = leaves.Count;
                        for (int i = 0; i < leafCount; ++i)
                        {
                            CoreLabel             rawToken   = (CoreLabel)leaves[i];
                            string                word       = rawToken.Value();
                            string                morphStr   = rawToken.OriginalText();
                            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                            string                lemma      = lemmaMorph.First();
                            string                morph      = lemmaMorph.Second();
                            if (string.IsNullOrEmpty(morph) || morph.Equals("XXX"))
                            {
                                // No usable morphology: fall back to the pre-terminal's value.
                                morph = ((CoreLabel)pretermYield[i]).Value();
                            }
                            System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                        }
                        // Blank line separates consecutive trees in the output.
                        System.Console.Out.WriteLine();
                    }
                }
                finally
                {
                    // Release the input file even if processing fails part-way.
                    if (tr != null)
                    {
                        // The tree reader is expected to close the reader it wraps.
                        tr.Close();
                    }
                    else if (br != null)
                    {
                        br.Close();
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Command-line entry point. Reads every tree from the file named by the single
        /// argument, collects each node whose label matches <c>/^MW/</c>, and prints a
        /// tab-separated frequency table per label (count, singleton count, percentages)
        /// followed by totals, the overall token count and the number of distinct POS
        /// sequences seen.
        /// </summary>
        /// <param name="args">Exactly one element: the path of the tree file (decoded as UTF-8).</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
                System.Environment.Exit(-1);
            }
            File treeFile = new File(args[0]);
            TwoDimensionalCounter <string, string> mweLabelToString = new TwoDimensionalCounter <string, string>();
            ICollection <string> uniquePOSSequences = Generics.NewHashSet();

            try
            {
                BufferedReader br = null;
                ITreeReader    tr = null;
                try
                {
                    br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
                    tr = trf.NewTreeReader(br);
                    TregexPattern pMWE = TregexPattern.Compile("/^MW/");
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        //Count MWE statistics
                        TregexMatcher m = pMWE.Matcher(t);
                        while (m.FindNextMatchingNode())
                        {
                            Tree              match     = m.GetMatch();
                            string            label     = match.Value();
                            IList <CoreLabel> labels    = match.TaggedLabeledYield();
                            StringBuilder     termYield = new StringBuilder();
                            StringBuilder     posYield  = new StringBuilder();
                            foreach (CoreLabel cl in labels)
                            {
                                termYield.Append(cl.Word()).Append(" ");
                                posYield.Append(cl.Tag()).Append(" ");
                            }
                            mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                            uniquePOSSequences.Add(posYield.ToString().Trim());
                        }
                    }
                }
                finally
                {
                    // Release the input file even if pattern compilation or reading fails.
                    if (tr != null)
                    {
                        //Closes the underlying reader
                        tr.Close();
                    }
                    else if (br != null)
                    {
                        br.Close();
                    }
                }
                System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
                double nMWEs          = mweLabelToString.TotalCount();
                int    nAllSingletons = 0;
                int    nTokens        = 0;
                foreach (string mweLabel in mweLabelToString.FirstKeySet())
                {
                    int               nSingletons = 0;
                    double            totalCount  = mweLabelToString.TotalCount(mweLabel);
                    ICounter <string> mc          = mweLabelToString.GetCounter(mweLabel);
                    foreach (string term in mc.KeySet())
                    {
                        if (mc.GetCount(term) == 1.0)
                        {
                            nSingletons++;
                        }
                        // Split on runs of whitespace with a real regex: string.Split(string)
                        // treats "\\s+" as a literal separator, so every term would count
                        // as a single token.
                        int termLength = System.Text.RegularExpressions.Regex.Split(term, "\\s+").Length;
                        nTokens += termLength * (int)mc.GetCount(term);
                    }
                    nAllSingletons += nSingletons;
                    System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
                }
                System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
                System.Console.Out.WriteLine("#tokens = " + nTokens);
                System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (TregexParseException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }