Esempio n. 1
0
        /// <summary>
        /// Rewrites the leaves of the trees in a tree file using the analyses in a
        /// Morfette file and prints every rewritten tree to standard output.
        /// </summary>
        /// <remarks>
        /// Trees and analyses are read in lock-step, one analysis per tree. Each leaf value
        /// becomes the original value, <c>MorphoFeatureSpecification.MorphoMark</c>, the lemma
        /// returned by <c>GetLemma</c>, <c>MorphoFeatureSpecification.LemmaMark</c> and the
        /// analysis tag, concatenated in that order. A warning is logged when one input
        /// runs out before the other.
        /// </remarks>
        /// <param name="args">Exactly two entries: the tree file and the Morfette TnT file.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile     = args[0];
            string             morfetteFile = args[1];
            ITreeReaderFactory trf          = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                try
                {
                    IEnumerator <IList <CoreLabel> > morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
                    bool uneven = false;
                    Tree tree;
                    while ((tree = tr.ReadTree()) != null)
                    {
                        if (!morfetteItr.MoveNext())
                        {
                            // A tree was read but the analyses are exhausted. Remember it here:
                            // re-reading the tree file afterwards would miss this case when the
                            // unmatched tree was the last one.
                            uneven = true;
                            break;
                        }
                        IList <CoreLabel> analysis = morfetteItr.Current;
                        IList <ILabel>    yield    = tree.Yield();
                        System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
                        int yieldLen = yield.Count;
                        for (int i = 0; i < yieldLen; ++i)
                        {
                            CoreLabel tokenAnalysis = analysis[i];
                            ILabel    token         = yield[i];
                            string    lemma         = GetLemma(token.Value(), tokenAnalysis.Lemma());
                            // Plain concatenation: string.Format does not understand Java-style
                            // "%s" placeholders and would emit the template text itself.
                            string newLeaf = token.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + tokenAnalysis.Tag();
                            ((CoreLabel)token).SetValue(newLeaf);
                        }
                        System.Console.Out.WriteLine(tree.ToString());
                    }
                    // Either a tree had no analysis, or analyses remain after the last tree.
                    if (uneven || morfetteItr.MoveNext())
                    {
                        log.Info("WARNING: Uneven input files!");
                    }
                }
                finally
                {
                    // Release the tree file even when processing fails part-way through.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 2
0
 /// <summary>Reads every parse tree available from a reader and adds it to this collection.</summary>
 /// <remarks>
 /// Each tree may optionally be encased in parens to allow for Penn
 /// Treebank style trees. When a tree's label implements <c>IHasIndex</c>,
 /// the label is stamped with the running sentence index and, if supplied,
 /// the document ID. An <c>IOException</c> raised while reading is logged
 /// and not rethrown.
 /// </remarks>
 /// <param name="r">
 /// The reader to read trees from.  (If you want it buffered,
 /// you should already have buffered it!)
 /// </param>
 /// <param name="id">
 /// An ID for where these trees come from (arbitrary, but something
 /// like a filename).  Can be <code>null</code> for none.
 /// </param>
 public void Load(Reader r, string id)
 {
     try
     {
         ITreeReader reader   = TreeReaderFactory().NewTreeReader(r);
         int         position = 0;
         Tree        current  = reader.ReadTree();
         while (current != null)
         {
             // Only index-aware labels can record where the tree came from.
             IHasIndex indexed = current.Label() as IHasIndex;
             if (indexed != null)
             {
                 if (id != null)
                 {
                     indexed.SetDocID(id);
                 }
                 indexed.SetSentIndex(position);
             }
             parseTrees.Add(current);
             position++;
             current = reader.ReadTree();
         }
     }
     catch (IOException ioFailure)
     {
         log.Info("load IO Exception: " + ioFailure);
     }
 }
        /// <summary>
        /// Reads every tree from every file in <c>fileList</c> and counts each
        /// (word, tag) pair of its tagged yield in <c>unigramTagger</c>.
        /// </summary>
        /// <remarks>
        /// Leaves tagged with <c>SpanishTreeNormalizer.MwTag</c> are skipped. The reader
        /// opened for a file is closed before the next file is processed, including
        /// when reading fails.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        public virtual void Process()
        {
            SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory();

            foreach (File file in fileList)
            {
                Reader      @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), AncoraEncoding));
                ITreeReader tr  = trf.NewTreeReader(@in);
                try
                {
                    Tree t;
                    // Tree reading will implicitly perform tree normalization for us
                    while ((t = tr.ReadTree()) != null)
                    {
                        // Update tagger with this tree
                        IList <CoreLabel> yield = t.TaggedLabeledYield();
                        foreach (CoreLabel leafLabel in yield)
                        {
                            if (leafLabel.Tag().Equals(SpanishTreeNormalizer.MwTag))
                            {
                                continue;
                            }
                            unigramTagger.IncrementCount(leafLabel.Word(), leafLabel.Tag());
                        }
                    }
                }
                finally
                {
                    // Without this, one file handle per input file stayed open.
                    tr.Close();
                }
            }
        }
Esempio n. 4
0
        /// <summary>Reads trees with a Negra Penn tree reader and Penn-prints each one.</summary>
        /// <param name="args">File to run on; only the first entry is used.</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                System.Console.Out.Printf("Usage: java %s tree_file%n", typeof(Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory).FullName);
                return;
            }
            ITreebankLanguagePack languagePack  = new NegraPennLanguagePack();
            ITreeReaderFactory    readerFactory = new Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory(2, false, false, languagePack);

            try
            {
                ITreeReader reader  = readerFactory.NewTreeReader(IOUtils.ReaderFromString(args[0], languagePack.GetEncoding()));
                Tree        current = reader.ReadTree();
                while (current != null)
                {
                    current.PennPrint();
                    current = reader.ReadTree();
                }
                reader.Close();
            }
            catch (UnsupportedEncodingException encodingFailure)
            {
                Sharpen.Runtime.PrintStackTrace(encodingFailure);
            }
            catch (FileNotFoundException missingFile)
            {
                Sharpen.Runtime.PrintStackTrace(missingFile);
            }
            catch (IOException ioFailure)
            {
                Sharpen.Runtime.PrintStackTrace(ioFailure);
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Re-reads a tree file, applies <c>TraverseAndFix</c> to every tree and writes
 /// the result, one tree per line, to a sibling file with the suffix ".fixed".
 /// </summary>
 /// <remarks>
 /// The writer and the reader are closed even when processing fails, so output
 /// produced before the failure is not left in an unclosed writer. I/O errors
 /// are printed and not rethrown.
 /// </remarks>
 /// <param name="treeFile">The tree file to read; also the base name of the output file.</param>
 /// <param name="pretermLabel">Counter handed through to <c>TraverseAndFix</c>.</param>
 /// <param name="unigramTagger">Counter handed through to <c>TraverseAndFix</c>.</param>
 private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> pretermLabel, TwoDimensionalCounter <string, string> unigramTagger)
 {
     try
     {
         BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
         ITreeReaderFactory trf = new FrenchTreeReaderFactory();
         ITreeReader        tr  = trf.NewTreeReader(br);
         int nTrees             = 0;
         try
         {
             PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
             try
             {
                 for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                 {
                     TraverseAndFix(t, pretermLabel, unigramTagger);
                     pw.Println(t.ToString());
                 }
             }
             finally
             {
                 pw.Close();
             }
         }
         finally
         {
             tr.Close();
         }
         System.Console.Out.WriteLine("Processed " + nTrees + " trees");
     }
     catch (UnsupportedEncodingException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (FileNotFoundException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
        /// <summary>Processes a single file containing AnCora XML trees.</summary>
        /// <remarks>
        /// Every tree read from the file is repeatedly split at the point reported by
        /// <c>FindSplitPoint</c> until no split point remains. Each resulting piece is
        /// collected and passed to <c>UpdateTagger</c>. The reader is closed whether or
        /// not reading succeeds.
        /// </remarks>
        /// <param name="file">The file to read.</param>
        /// <param name="trf">Factory used to create the tree reader for <paramref name="file"/>.</param>
        /// <param name="encoding">Character encoding used to decode the file.</param>
        /// <returns>
        /// The tagger statistics paired with the trees that were read, or
        /// <code>null</code> when an <c>IOException</c> occurred (the stack trace is printed).
        /// </returns>
        private static Pair <TwoDimensionalCounter <string, string>, IList <Tree> > ProcessTreeFile(File file, SpanishXMLTreeReaderFactory trf, string encoding)
        {
            TwoDimensionalCounter <string, string> tagger = new TwoDimensionalCounter <string, string>();

            try
            {
                Reader       @in   = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
                ITreeReader  tr    = trf.NewTreeReader(file.GetPath(), @in);
                IList <Tree> trees = new List <Tree>();
                try
                {
                    Tree t;
                    Tree splitPoint;
                    while ((t = tr.ReadTree()) != null)
                    {
                        do
                        {
                            // We may need to split the current tree into multiple parts.
                            // (If not, a call to `Split` with a `null` split-point is a
                            // no-op.)
                            splitPoint = FindSplitPoint(t);
                            Pair <Tree, Tree> split = Split(t, splitPoint);
                            Tree toAdd = split.First();
                            // Continue with the remainder of the tree on the next pass.
                            t = split.Second();
                            trees.Add(toAdd);
                            UpdateTagger(tagger, toAdd);
                        }while (splitPoint != null);
                    }
                }
                finally
                {
                    // Previously skipped when ReadTree threw, leaving the file open.
                    tr.Close();
                }
                return(new Pair <TwoDimensionalCounter <string, string>, IList <Tree> >(tagger, trees));
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                return(null);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Splits a tree file into dev, train and test files by tree position.
        /// </summary>
        /// <remarks>
        /// Trees 0-482 go to "&lt;input&gt;.clean.dev", trees 483-5723 to
        /// "&lt;input&gt;.clean.train" and all remaining trees to "&lt;input&gt;.clean.test".
        /// All writers and the reader are closed even when processing fails.
        /// </remarks>
        /// <param name="args">Exactly one entry: the tree file to split.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(SplitMaker).FullName);
                System.Environment.Exit(-1);
            }
            // Exclusive upper bounds (in tree position) of the dev and train portions.
            const int devEnd   = 483;
            const int trainEnd = 5724;

            ITreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
            string inputFile          = args[0];
            File   treeFile           = new File(inputFile);

            try
            {
                ITreeReaderFactory trf      = new HebrewTreeReaderFactory();
                ITreeReader        tr       = null;
                PrintWriter        pwDev    = null;
                PrintWriter        pwTrain  = null;
                PrintWriter        pwTest   = null;
                int                numTrees = 0;
                try
                {
                    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.GetEncoding()));
                    tr      = trf.NewTreeReader(br);
                    pwDev   = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.dev"), false, tlp.GetEncoding()));
                    pwTrain = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.train"), false, tlp.GetEncoding()));
                    pwTest  = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.test"), false, tlp.GetEncoding()));
                    for (Tree t; ((t = tr.ReadTree()) != null); numTrees++)
                    {
                        PrintWriter target;
                        if (numTrees < devEnd)
                        {
                            target = pwDev;
                        }
                        else if (numTrees < trainEnd)
                        {
                            target = pwTrain;
                        }
                        else
                        {
                            target = pwTest;
                        }
                        target.Println(t.ToString());
                    }
                }
                finally
                {
                    // Writers first, so a failing reader close cannot prevent them from closing.
                    if (pwDev != null)
                    {
                        pwDev.Close();
                    }
                    if (pwTrain != null)
                    {
                        pwTrain.Close();
                    }
                    if (pwTest != null)
                    {
                        pwTest.Close();
                    }
                    if (tr != null)
                    {
                        tr.Close();
                    }
                }
                System.Console.Error.Printf("Processed %d trees.%n", numTrees);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>For debugging: prints every tree of the given French XML tree files.</summary>
        /// <remarks>
        /// Each tree is written to standard output as
        /// "&lt;file name without extension&gt;-&lt;sentence ID&gt;\t&lt;tree&gt;". Per-file and total
        /// tree counts go to standard error. A file name that contains no '.' is used
        /// unchanged as the prefix.
        /// </remarks>
        /// <param name="args">One or more tree files.</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file(s)%n%n", typeof(Edu.Stanford.Nlp.Trees.International.French.FrenchXMLTreeReader).FullName);
                System.Environment.Exit(-1);
            }
            IList <File> fileList = new List <File>();

            foreach (string arg in args)
            {
                fileList.Add(new File(arg));
            }
            ITreeReaderFactory trf             = new FrenchXMLTreeReaderFactory(false);
            int totalTrees                     = 0;
            ICollection <string> morphAnalyses = Generics.NewHashSet();

            try
            {
                foreach (File file in fileList)
                {
                    // LastIndexOf returns -1 for names without an extension; taking a
                    // substring up to -1 would throw, so fall back to the whole name.
                    string fileName          = file.GetName();
                    int    extensionStart    = fileName.LastIndexOf('.');
                    string canonicalFileName = extensionStart < 0 ? fileName : Sharpen.Runtime.Substring(fileName, 0, extensionStart);
                    int    numTrees          = 0;
                    ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));
                    try
                    {
                        for (Tree t; (t = tr.ReadTree()) != null; numTrees++)
                        {
                            string ftbID = ((CoreLabel)t.Label()).Get(typeof(CoreAnnotations.SentenceIDAnnotation));
                            System.Console.Out.Printf("%s-%s\t%s%n", canonicalFileName, ftbID, t.ToString());
                            IList <ILabel> leaves = t.Yield();
                            foreach (ILabel label in leaves)
                            {
                                if (label is CoreLabel)
                                {
                                    morphAnalyses.Add(((CoreLabel)label).OriginalText());
                                }
                            }
                        }
                    }
                    finally
                    {
                        tr.Close();
                    }
                    System.Console.Error.Printf("%s: %d trees%n", file.GetName(), numTrees);
                    totalTrees += numTrees;
                }
                //wsg2011: Print out the observed morphological analyses
                //      for(String analysis : morphAnalyses)
                //        log.info(analysis);
                System.Console.Error.Printf("%nRead %d trees%n", totalTrees);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>Reads the next tree accepted by the filter <c>f</c>.</summary>
        /// <remarks>Trees rejected by <c>f.Test</c> are read from <c>tr</c> and discarded.</remarks>
        /// <returns>The next accepted tree, or <code>null</code> once <c>tr</c> returns <code>null</code>.</returns>
        /// <exception cref="System.IO.IOException"/>
        public virtual Tree ReadTree()
        {
            while (true)
            {
                Tree candidate = tr.ReadTree();
                // Stop at end of input, or at the first tree the filter lets through.
                if (candidate == null || f.Test(candidate))
                {
                    return(candidate);
                }
            }
        }
Esempio n. 10
0
        //Delete sentence-initial punctuation
        //Delete sentence final punctuation that is preceded by punctuation (first time)
        //Delete sentence final punctuation that is preceded by punctuation (second time)
        //Convert remaining sentence-final punctuation to . if it is not [.!?]
        //Delete medial, sentence-final punctuation
        //Now move the sentence-final mark under SENT
        //For those trees that lack a sentence-final punc, add one.
        //Finally, delete these punctuation marks, which I can't seem to kill otherwise...
        //A bad MWADV tree in the training set
        // Not sure why this got a label of X.  Similar trees suggest it
        // should be A instead
        // This also seems to be mislabeled
        /// <summary>
        /// Reads a French treebank file, drops known-bad trees, applies the
        /// <c>FTBCorrector</c> transformation to the rest and prints them to standard output.
        /// </summary>
        /// <remarks>
        /// A tree is discarded (and logged) when it matches either of two Tregex patterns:
        /// a SENT whose only child is PUNC, or a SENT with exactly two PUNC children.
        /// The number reported on standard error counts only the trees actually written.
        /// </remarks>
        /// <param name="args">Exactly one entry: the tree file to correct.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
                System.Environment.Exit(-1);
            }
            ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
            File             f  = new File(args[0]);

            try
            {
                //These bad trees in the Candito training set should be thrown out:
                //  (ROOT (SENT (" ") (. .)))
                //  (ROOT (SENT (. .)))
                TregexPattern      pBadTree  = TregexPattern.Compile("@SENT <: @PUNC");
                TregexPattern      pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
                BufferedReader     br        = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
                ITreeReaderFactory trf       = new FrenchTreeReaderFactory();
                ITreeReader        tr        = trf.NewTreeReader(br);
                // Counts written trees only; discarded trees used to be included in the
                // "Wrote" total.
                int nWritten = 0;
                try
                {
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        TregexMatcher m  = pBadTree.Matcher(t);
                        TregexMatcher m2 = pBadTree2.Matcher(t);
                        if (m.Find() || m2.Find())
                        {
                            log.Info("Discarding tree: " + t.ToString());
                        }
                        else
                        {
                            Tree fixedT = tt.TransformTree(t);
                            System.Console.Out.WriteLine(fixedT.ToString());
                            nWritten++;
                        }
                    }
                }
                finally
                {
                    tr.Close();
                }
                System.Console.Error.Printf("Wrote %d trees%n", nWritten);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (TregexParseException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 11
0
        /// <summary>
        /// Collects MWE statistics from a French tree file, writes them to CSV files
        /// and then resolves DUMMY tags in the same file.
        /// </summary>
        /// <param name="args">Exactly one entry: the tree file to process.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
                System.Environment.Exit(-1);
            }
            File treeFile = new File(args[0]);
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                ITreeReaderFactory readerFactory = new FrenchTreeReaderFactory();
                BufferedReader     buffered      = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReader        reader        = readerFactory.NewTreeReader(buffered);
                Tree               current       = reader.ReadTree();
                while (current != null)
                {
                    CountMWEStatistics(current, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
                    current = reader.ReadTree();
                }
                // Closing the tree reader also closes the underlying reader.
                reader.Close();

                // One CSV file per collected mapping.
                System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
                PrintCounter(labelTerm, "label_term.csv");
                System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
                PrintCounter(termLabel, "term_label.csv");
                System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
                PrintCounter(labelPreterm, "label_pos.csv");
                System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
                PrintCounter(pretermLabel, "pos_label.csv");

                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, pretermLabel, unigramTagger);

                System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
                System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
                System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException encodingFailure)
            {
                Sharpen.Runtime.PrintStackTrace(encodingFailure);
            }
            catch (FileNotFoundException missingFile)
            {
                Sharpen.Runtime.PrintStackTrace(missingFile);
            }
            catch (IOException ioFailure)
            {
                Sharpen.Runtime.PrintStackTrace(ioFailure);
            }
        }
        /// <summary>
        /// Prints the leaves of every tree in a French tree file as
        /// "word lemma morph" lines, with a blank line after each tree.
        /// </summary>
        /// <remarks>
        /// The lemma and morphology come from <c>MorphoFeatureSpecification.SplitMorphString</c>.
        /// When the morphology is missing, empty or "XXX", the value of the matching
        /// pre-terminal is printed instead.
        /// </remarks>
        /// <param name="args">Exactly one entry: the tree file to convert.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile      = args[0];
            ITreeReaderFactory readerFactory = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader reader = readerFactory.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                Tree        current;
                while ((current = reader.ReadTree()) != null)
                {
                    IList <ILabel> preterminals = current.PreTerminalYield();
                    IList <ILabel> leaves       = current.Yield();
                    for (int position = 0, leafCount = leaves.Count; position < leafCount; ++position)
                    {
                        CoreLabel             leaf  = (CoreLabel)leaves[position];
                        string                word  = leaf.Value();
                        Pair <string, string> parts = MorphoFeatureSpecification.SplitMorphString(word, leaf.OriginalText());
                        string                lemma = parts.First();
                        string                morph = parts.Second();
                        // No usable morphology: fall back to the pre-terminal value.
                        bool unusable = string.IsNullOrEmpty(morph) || morph.Equals("XXX");
                        if (unusable)
                        {
                            morph = ((CoreLabel)preterminals[position]).Value();
                        }
                        System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                    }
                    System.Console.Out.WriteLine();
                }
                reader.Close();
            }
            catch (UnsupportedEncodingException encodingFailure)
            {
                Sharpen.Runtime.PrintStackTrace(encodingFailure);
            }
            catch (FileNotFoundException missingFile)
            {
                Sharpen.Runtime.PrintStackTrace(missingFile);
            }
            catch (IOException ioFailure)
            {
                Sharpen.Runtime.PrintStackTrace(ioFailure);
            }
        }
Esempio n. 13
0
        /// <summary>
        /// Builds a unigram tagger from a Spanish tree file, resolves DUMMY tags in that
        /// file and prints summary statistics.
        /// </summary>
        /// <remarks>
        /// Options are parsed with <c>StringUtils.ArgsToProperties</c>. Usage is logged when
        /// the tree file argument is absent or "help" is present. The "ner" option
        /// (default false) and the "normalize" option (default true) are forwarded to
        /// <c>ResolveDummyTags</c>. The "fixed" percentages are reported as 0 when
        /// nothing was missing.
        /// </remarks>
        /// <param name="args">Command-line options followed by the tree file.</param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);

            if (!options.Contains(string.Empty) || options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
            bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
            File treeFile  = new File(options.GetProperty(string.Empty));
            TwoDimensionalCounter <string, string> labelTerm     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> termLabel     = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> labelPreterm  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> pretermLabel  = new TwoDimensionalCounter <string, string>();
            TwoDimensionalCounter <string, string> unigramTagger = new TwoDimensionalCounter <string, string>();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                try
                {
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        UpdateTagger(unigramTagger, t);
                    }
                }
                finally
                {
                    //Closes the underlying reader
                    tr.Close();
                }
                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);

                // Avoid NaN/Infinity in the report when nothing was missing.
                double fixedPosPercent     = nMissingPOS == 0 ? 0.0 : (double)nFixedPOS / nMissingPOS * 100;
                double fixedPhrasalPercent = nMissingPhrasal == 0 ? 0.0 : (double)nFixedPhrasal / nMissingPhrasal * 100;
                // string.Format needs composite "{0}" placeholders; the Java-style
                // "%d"/"%.2f" template was printed verbatim and the values were dropped.
                System.Console.Out.WriteLine(string.Format(System.Globalization.CultureInfo.InvariantCulture, "#Missing POS: {0} (fixed: {1}, {2:F2}%)", nMissingPOS, nFixedPOS, fixedPosPercent));
                System.Console.Out.WriteLine(string.Format(System.Globalization.CultureInfo.InvariantCulture, "#Missing Phrasal: {0} (fixed: {1}, {2:F2}%)", nMissingPhrasal, nFixedPhrasal, fixedPhrasalPercent));
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 14
0
        /// <summary>
        /// Re-reads a Spanish tree file, fixes and expands every tree, and writes the
        /// result, one tree per line, to a sibling file with the suffix ".fixed".
        /// </summary>
        /// <remarks>
        /// Each tree goes through <c>TraverseAndFix</c>, then
        /// <c>MultiWordTreeExpander.ExpandPhrases</c>, and finally, when a normalizer is
        /// supplied, <c>NormalizeWholeTree</c>. The writer and the reader are closed even
        /// when processing fails. I/O errors are printed and not rethrown.
        /// </remarks>
        /// <param name="treeFile">The tree file to read; also the base name of the output file.</param>
        /// <param name="unigramTagger">Counter handed through to <c>TraverseAndFix</c>.</param>
        /// <param name="retainNER">Flag handed through to <c>TraverseAndFix</c>.</param>
        /// <param name="tn">Normalizer applied to each expanded tree; may be <code>null</code> to skip normalization.</param>
        private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
        {
            ITreeFactory          tf       = new LabeledScoredTreeFactory();
            MultiWordTreeExpander expander = new MultiWordTreeExpander();

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                int nTrees             = 0;
                try
                {
                    PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
                    try
                    {
                        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                        {
                            TraverseAndFix(t, null, unigramTagger, retainNER);
                            // Now "decompress" further the expanded trees formed by
                            // multiword token splitting
                            t = expander.ExpandPhrases(t, tn, tf);
                            if (tn != null)
                            {
                                t = tn.NormalizeWholeTree(t, tf);
                            }
                            pw.Println(t.ToString());
                        }
                    }
                    finally
                    {
                        pw.Close();
                    }
                }
                finally
                {
                    tr.Close();
                }
                System.Console.Out.WriteLine("Processed " + nTrees + " trees");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Read trees from the given reader and output their processed forms to
        /// standard output.
        /// </summary>
        /// <remarks>
        /// Only trees accepted by <c>ShouldPrintTree</c> are printed, each as
        /// "&lt;file name without extension&gt;-&lt;sentence ID&gt;\t&lt;tree&gt;". A summary with
        /// the read and printed counts goes to standard error. A file name that
        /// contains no '.' is used unchanged as the prefix.
        /// </remarks>
        /// <param name="file">File the trees come from; only its name is used here.</param>
        /// <param name="tr">Reader supplying the trees; it is not closed by this method.</param>
        /// <param name="posPattern">Pattern forwarded to <c>ShouldPrintTree</c>.</param>
        /// <param name="wordPattern">Pattern forwarded to <c>ShouldPrintTree</c>.</param>
        /// <param name="plainPrint">Formatting flag forwarded to <c>ToString(Tree, bool)</c>.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Process(File file, ITreeReader tr, Pattern posPattern, Pattern wordPattern, bool plainPrint)
        {
            Tree t;
            int  numTrees         = 0;
            int  numTreesRetained = 0;
            // LastIndexOf returns -1 for names without an extension; taking a
            // substring up to -1 would throw, so fall back to the whole name.
            string fileName          = file.GetName();
            int    extensionStart    = fileName.LastIndexOf('.');
            string canonicalFileName = extensionStart < 0 ? fileName : Sharpen.Runtime.Substring(fileName, 0, extensionStart);

            while ((t = tr.ReadTree()) != null)
            {
                numTrees++;
                if (!ShouldPrintTree(t, posPattern, wordPattern))
                {
                    continue;
                }
                numTreesRetained++;
                string ftbID  = ((CoreLabel)t.Label()).Get(typeof(CoreAnnotations.SentenceIDAnnotation));
                string output = ToString(t, plainPrint);
                System.Console.Out.Printf("%s-%s\t%s%n", canonicalFileName, ftbID, output);
            }
            System.Console.Error.Printf("%s: %d trees, %d matched and printed%n", file.GetName(), numTrees, numTreesRetained);
        }
        /// <summary>
        /// For debugging: runs <c>MWETreeVisitorExternal</c> over every tree of an
        /// Arabic tree file and prints the visited trees to standard output.
        /// </summary>
        /// <remarks>
        /// When a tree's root value is "ROOT", its first child is visited and printed
        /// instead of the tree itself.
        /// </remarks>
        /// <param name="args">Exactly one entry: the ATB tree file.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal).FullName);
                System.Environment.Exit(-1);
            }
            ITreeReaderFactory readerFactory = new ArabicTreeReaderFactory();

            try
            {
                ITreeVisitor visitor   = new Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal();
                ITreeReader  reader    = readerFactory.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")));
                int          processed = 0;
                Tree         current;
                while ((current = reader.ReadTree()) != null)
                {
                    // Skip over the ROOT tag
                    Tree target = current.Value().Equals("ROOT") ? current.FirstChild() : current;
                    visitor.VisitTree(target);
                    System.Console.Out.WriteLine(target.ToString());
                    ++processed;
                }
                reader.Close();
                System.Console.Error.Printf("Processed %d trees.%n", processed);
            }
            catch (UnsupportedEncodingException encodingFailure)
            {
                Sharpen.Runtime.PrintStackTrace(encodingFailure);
            }
            catch (FileNotFoundException missingFile)
            {
                Sharpen.Runtime.PrintStackTrace(missingFile);
            }
            catch (IOException ioFailure)
            {
                Sharpen.Runtime.PrintStackTrace(ioFailure);
            }
        }
Esempio n. 17
0
 /// <summary>Loads the next yield into <c>nextYield</c>.</summary>
 /// <remarks>
 /// When <c>treeReader</c> is set, the next tree is read and <c>nextYield</c> becomes
 /// the list of tags of its tagged yield. Otherwise the next line of
 /// <c>fileReader</c> is split on runs of whitespace. <c>nextYield</c> is set to
 /// <code>null</code> at end of input or when an <c>IOException</c> occurs (the
 /// stack trace is printed).
 /// </remarks>
 private void PrimeNext()
 {
     try
     {
         if (treeReader != null)
         {
             Tree tree = treeReader.ReadTree();
             if (tree == null)
             {
                 nextYield = null;
             }
             else
             {
                 IList <CoreLabel> mLabeledLeaves = tree.TaggedLabeledYield();
                 nextYield = new List <string>(mLabeledLeaves.Count);
                 foreach (CoreLabel label in mLabeledLeaves)
                 {
                     nextYield.Add(label.Tag());
                 }
             }
         }
         else
         {
             string line = fileReader.ReadLine();
             if (line == null)
             {
                 nextYield = null;
             }
             else
             {
                 // "\\s+" is a regular expression. string.Split would treat it as a
                 // literal separator (where that overload exists at all), so split
                 // with Regex explicitly.
                 string[] fields = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
                 // Drop trailing empty fields, as the Java String.split this code was
                 // translated from did; a line with no whitespace match stays as-is.
                 int count = fields.Length;
                 if (count > 1)
                 {
                     while (count > 0 && fields[count - 1].Length == 0)
                     {
                         count--;
                     }
                 }
                 string[] kept = new string[count];
                 System.Array.Copy(fields, kept, count);
                 nextYield = Arrays.AsList(kept);
             }
         }
     }
     catch (IOException e)
     {
         nextYield = null;
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Esempio n. 18
0
        //Delete sentence-initial punctuation
        //Delete sentence-initial punctuation (again)
        //Delete sentence final punctuation that is preceded by punctuation (first time)
        //Delete sentence final punctuation that is preceded by punctuation (second time)
        //Convert remaining sentence-final punctuation to . if it is not [.!?]
        //Delete medial, sentence-final punctuation
        //    ("@PUNC=punc <: /[!\\.\\?]+/ $. __\n"
        //        + "prune punc\n"
        //        + "\n") +
        //Now move the sentence-final mark under the top-level node
        //For those trees that lack a sentence-final punc, add one.
        //    ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n"
        //        + "insert (PUNC .) $- loc\n"
        //        + "\n");
        /// <summary>
        /// Reads every tree from the UTF-8 file named by the single argument, transforms
        /// each one with an <c>ATBCorrector</c>, and prints the result to standard output.
        /// </summary>
        /// <remarks>
        /// The number of trees written is reported on standard error. If the argument
        /// count is not exactly one, a usage message is logged and the process exits
        /// with status -1. I/O failures are printed as stack traces, not propagated.
        /// </remarks>
        /// <param name="args">exactly one element: the path of the tree file</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector).FullName + " filename\n");
                System.Environment.Exit(-1);
            }
            ITreeTransformer tt = new Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector();
            File             f  = new File(args[0]);

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
                ITreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
                ITreeReader        tr  = trf.NewTreeReader(br);
                int nTrees             = 0;
                try
                {
                    for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                    {
                        Tree fixedT = tt.TransformTree(t);
                        System.Console.Out.WriteLine(fixedT.ToString());
                    }
                }
                finally
                {
                    // Close the reader even when reading or transforming a tree fails,
                    // so the open file is not leaked on the error path.
                    tr.Close();
                }
                System.Console.Error.Printf("Wrote %d trees%n", nTrees);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Reads every tree from the file named by the single argument with a
        /// <c>HebrewTreeReaderFactory</c> and prints each tree to standard output.
        /// </summary>
        /// <remarks>
        /// The file is decoded with the encoding returned by
        /// <c>HebrewTreebankLanguagePack.GetEncoding()</c>. The number of trees read is
        /// reported on standard error. If the argument count is not exactly one, a usage
        /// message is printed and the process exits with status -1. I/O failures are
        /// printed as stack traces, not propagated.
        /// </remarks>
        /// <param name="args">exactly one element: the path of the tree file</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file > trees%n", typeof(HebrewTreeReaderFactory).FullName);
                System.Environment.Exit(-1);
            }
            ITreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
            File treeFile             = new File(args[0]);

            try
            {
                ITreeReaderFactory trf = new HebrewTreeReaderFactory();
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.GetEncoding()));
                ITreeReader        tr  = trf.NewTreeReader(br);
                int numTrees           = 0;
                try
                {
                    for (Tree t; ((t = tr.ReadTree()) != null); numTrees++)
                    {
                        System.Console.Out.WriteLine(t.ToString());
                    }
                }
                finally
                {
                    // Close the reader even when reading a tree fails, so the open
                    // file is not leaked on the error path.
                    tr.Close();
                }
                System.Console.Error.Printf("Processed %d trees.%n", numTrees);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Esempio n. 20
0
        /// <summary>Reads every parse tree in the given file and adds it to <c>parseTrees</c>.</summary>
        /// <remarks>
        /// Each tree may optionally be encased in parens to allow for Penn
        /// Treebank style trees.
        /// This method implements the <code>FileProcessor</code> interface.
        /// <para>
        /// Trees are read through <c>TreeReaderFactory()</c> using the encoding returned by
        /// <c>Encoding()</c>. When a tree's label implements <c>IHasIndex</c>, the file name
        /// and the tree's zero-based position in the file are recorded on it. If the
        /// <c>srlMap</c> field holds an entry whose key is a suffix of the file's absolute
        /// path, the SRL strings stored for each tree position are parsed and written
        /// onto the labels of that tree as CoNLL predicate / SRL annotations.
        /// </para>
        /// </remarks>
        /// <param name="file">file to load trees from</param>
        /// <exception cref="RuntimeIOException">wraps an IOException raised while reading the file</exception>
        public void ProcessFile(File file)
        {
            ITreeReader reader = null;
            // SRL entries that apply to this particular file (stays null when there are none)
            CollectionValuedMap <int, string> fileSrl = null;

            if (this.srlMap != null)
            {
                // Pick the first registered suffix that the file's absolute path ends with.
                string absolutePath = file.GetAbsolutePath();
                foreach (string suffix in this.srlMap.Keys)
                {
                    if (!absolutePath.EndsWith(suffix))
                    {
                        continue;
                    }
                    fileSrl = this.srlMap[suffix];
                    break;
                }
                if (fileSrl == null)
                {
                    log.Info("could not find SRL entries for file: " + file);
                }
            }
            try
            {
                // Opening the file may raise an IOException, handled below.
                reader = TreeReaderFactory().NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), Encoding())));
                Tree pt;
                for (int sentIndex = 0; (pt = reader.ReadTree()) != null; sentIndex++)
                {
                    if (pt.Label() is IHasIndex)
                    {
                        // Record where this tree came from.
                        IHasIndex indexed = (IHasIndex)pt.Label();
                        indexed.SetDocID(file.GetName());
                        indexed.SetSentIndex(sentIndex);
                    }
                    if (fileSrl == null)
                    {
                        parseTrees.Add(pt);
                        continue;
                    }
                    ICollection <string> srls = fileSrl[sentIndex];
                    parseTrees.Add(pt);
                    if (srls.IsEmpty())
                    {
                        continue;
                    }
                    foreach (string srl in srls)
                    {
                        string[] fields    = srl.Split("\\s+");
                        int      verbIndex = System.Convert.ToInt32(fields[0]);
                        // The lemma is computed but not used below; the access is kept so that
                        // an entry with too few fields still fails at this point.
                        string lemma = fields[2].Split("\\.")[0];
                        Tree   verb  = Edu.Stanford.Nlp.Trees.Trees.GetTerminal(pt, verbIndex);
                        ((CoreLabel)verb.Label()).Set(typeof(CoreAnnotations.CoNLLPredicateAnnotation), true);
                        for (int i = 4; i < fields.Length; i++)
                        {
                            // Each argument field is split on "-" into locations and a type.
                            string[] locsAndType = fields[i].Split("-");
                            string   locs        = locsAndType[0];
                            string   argType     = locsAndType[1];
                            if (argType.Equals("rel"))
                            {
                                continue;
                            }
                            foreach (string loc in locs.Split("[*,]"))
                            {
                                string[] termAndHeight = loc.Split(":");
                                int      term          = System.Convert.ToInt32(termAndHeight[0]);
                                int      height        = System.Convert.ToInt32(termAndHeight[1]);
                                // Start at the preterminal and climb `height` parents.
                                Tree node = Edu.Stanford.Nlp.Trees.Trees.GetPreTerminal(pt, term);
                                for (int remaining = height; remaining > 0; remaining--)
                                {
                                    node = node.Parent(pt);
                                }
                                CoreLabel nodeLabel = (CoreLabel)node.Label();
                                IDictionary <int, string> roles = nodeLabel.Get(typeof(CoreAnnotations.CoNLLSRLAnnotation));
                                if (roles == null)
                                {
                                    roles = Generics.NewHashMap();
                                    nodeLabel.Set(typeof(CoreAnnotations.CoNLLSRLAnnotation), roles);
                                }
                                roles[verbIndex] = argType;
                            }
                        }
                    }
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException("MemoryTreebank.processFile IOException in file " + file, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
        /// <summary>
        /// Reads trees from the UTF-8 file named by the first argument with a
        /// <c>SpanishTreeReaderFactory</c> and prints one "word TAB label" line per
        /// preterminal, followed by an empty line after each tree.
        /// </summary>
        /// <remarks>
        /// The label is chosen from a single type character: when the value of the node
        /// returned by <c>t.Ancestor(1, tree)</c> matches <c>^grup\.nom\.</c>, the character
        /// at index 9 of that value is used; otherwise, when the preterminal's own value
        /// matches <c>^np0000.$</c>, the character at index 6 is used. The character maps to
        /// PERS ('p'), LUG ('l'), ORG ('o'), OTROS ('0') or O (anything else).
        /// If no argument is given, a usage message is printed and the process exits with
        /// status -1. I/O failures are printed as stack traces, not propagated.
        /// </remarks>
        /// <param name="args">the first element is the path of the tree file</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToTSV).FullName);
                System.Environment.Exit(-1);
            }
            string treeFile = args[0];

            try
            {
                BufferedReader     br        = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf       = new SpanishTreeReaderFactory();
                ITreeReader        tr        = trf.NewTreeReader(br);
                StringBuilder      sb        = new StringBuilder();
                string             nl        = Runtime.GetProperty("line.separator");
                Pattern            nePattern = Pattern.Compile("^grup\\.nom\\.");
                Pattern            npPattern = Pattern.Compile("^np0000.$");
                try
                {
                    for (Tree tree; (tree = tr.ReadTree()) != null;)
                    {
                        foreach (Tree t in tree)
                        {
                            if (!t.IsPreTerminal())
                            {
                                continue;
                            }
                            char   type         = 'O';
                            Tree   grandma      = t.Ancestor(1, tree);
                            string grandmaValue = ((CoreLabel)grandma.Label()).Value();
                            // grup.nom.x
                            if (nePattern.Matcher(grandmaValue).Find())
                            {
                                type = grandmaValue[9];
                            }
                            else
                            {
                                // else check the pos for np0000x or not
                                string pos = ((CoreLabel)t.Label()).Value();
                                if (npPattern.Matcher(pos).Find())
                                {
                                    type = pos[6];
                                }
                            }
                            Tree   wordNode = t.FirstChild();
                            string word     = ((CoreLabel)wordNode.Label()).Value();
                            sb.Append(word).Append("\t");
                            switch (type)
                            {
                            case 'p':
                            {
                                sb.Append("PERS");
                                break;
                            }

                            case 'l':
                            {
                                sb.Append("LUG");
                                break;
                            }

                            case 'o':
                            {
                                sb.Append("ORG");
                                break;
                            }

                            case '0':
                            {
                                sb.Append("OTROS");
                                break;
                            }

                            default:
                            {
                                sb.Append("O");
                                break;
                            }
                            }
                            sb.Append(nl);
                        }
                        // Empty line between trees.
                        sb.Append(nl);
                    }
                    // Output is emitted only after every tree was read successfully.
                    System.Console.Out.Write(sb.ToString());
                }
                finally
                {
                    // Close the reader even when reading or processing a tree fails,
                    // so the open file is not leaked on the error path.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Counts the nodes whose label matches the Tregex pattern <c>/^MW/</c> in the
        /// UTF-8 tree file named by the single argument and prints a frequency table.
        /// </summary>
        /// <remarks>
        /// For every matching node the space-joined words of its yield are counted under
        /// the node's label, and the space-joined tag sequence is added to a set. After
        /// reading, one row per label is printed (total count, number of yields seen
        /// exactly once, and two percentages), followed by a TOTAL row, the token count
        /// and the number of distinct tag sequences.
        /// If the argument count is not exactly one, a usage message is printed and the
        /// process exits with status -1. I/O and Tregex parse failures are printed as
        /// stack traces, not propagated.
        /// </remarks>
        /// <param name="args">exactly one element: the path of the tree file</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
                System.Environment.Exit(-1);
            }
            File treeFile = new File(args[0]);
            TwoDimensionalCounter <string, string> mweLabelToString = new TwoDimensionalCounter <string, string>();
            ICollection <string> uniquePOSSequences = Generics.NewHashSet();

            try
            {
                BufferedReader     br   = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf  = new FrenchTreeReaderFactory();
                ITreeReader        tr   = trf.NewTreeReader(br);
                try
                {
                    TregexPattern pMWE = TregexPattern.Compile("/^MW/");
                    for (Tree t; (t = tr.ReadTree()) != null;)
                    {
                        //Count MWE statistics
                        TregexMatcher m = pMWE.Matcher(t);
                        while (m.FindNextMatchingNode())
                        {
                            Tree              match     = m.GetMatch();
                            string            label     = match.Value();
                            IList <CoreLabel> yield     = match.TaggedLabeledYield();
                            StringBuilder     termYield = new StringBuilder();
                            StringBuilder     posYield  = new StringBuilder();
                            foreach (CoreLabel cl in yield)
                            {
                                termYield.Append(cl.Word()).Append(" ");
                                posYield.Append(cl.Tag()).Append(" ");
                            }
                            mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                            uniquePOSSequences.Add(posYield.ToString().Trim());
                        }
                    }
                }
                finally
                {
                    // Close the tree reader (the original comment notes this also closes
                    // the underlying reader) even when pattern compilation or reading
                    // fails, so the open file is not leaked on the error path.
                    tr.Close();
                }
                System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
                double nMWEs          = mweLabelToString.TotalCount();
                int    nAllSingletons = 0;
                int    nTokens        = 0;
                foreach (string mweLabel in mweLabelToString.FirstKeySet())
                {
                    int               nSingletons = 0;
                    double            totalCount  = mweLabelToString.TotalCount(mweLabel);
                    ICounter <string> mc          = mweLabelToString.GetCounter(mweLabel);
                    foreach (string term in mc.KeySet())
                    {
                        if (mc.GetCount(term) == 1.0)
                        {
                            nSingletons++;
                        }
                        // Tokens contributed by this yield: words in the yield times its count.
                        nTokens += term.Split("\\s+").Length *(int)mc.GetCount(term);
                    }
                    nAllSingletons += nSingletons;
                    System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
                }
                System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
                System.Console.Out.WriteLine("#tokens = " + nTokens);
                System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (TregexParseException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }