/// <summary>
        /// Calculate sister annotation statistics suitable for doing
        /// selective sister splitting in the PCFGParser inside the
        /// FactoredParser.
        /// </summary>
        /// <remarks>
        /// Before touching the arguments, this method always prints the KL
        /// divergence of a small two-entry counter against itself.
        /// </remarks>
        /// <param name="args">
        /// args[0]: path to the Treebank (required).
        /// args[1]: character encoding of the treebank (optional, defaults to UTF-8).
        /// </param>
        public static void Main(string[] args)
        {
            // Self-comparison of a tiny counter; printed on every invocation.
            ClassicCounter <string> c = new ClassicCounter <string>();
            c.SetCount("A", 0);
            c.SetCount("B", 1);
            double d = Counters.KlDivergence(c, c);
            System.Console.Out.WriteLine("KL Divergence: " + d);

            if (args.Length < 1)
            {
                // The message previously named ParentAnnotationStats and hid the
                // optional encoding argument that is read from args[1] below.
                System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath [encoding]");
                return;
            }

            string encoding = (args.Length > 1) ? args[1] : "UTF-8";
            SisterAnnotationStats pas = new SisterAnnotationStats();
            Treebank treebank         = new DiskTreebank(null, encoding);
            treebank.LoadPath(args[0]);
            treebank.Apply(pas);
            pas.PrintStats();
        }
 /// <summary>
 /// Walks every tree in <paramref name="tb"/>, feeds per-tree facts into a new
 /// ObservedCorpusStats labelled <paramref name="name"/>, and finalizes it.
 /// </summary>
 /// <remarks>
 /// When the makeVocab field is set, trainVocab is replaced with a fresh set
 /// before reading. Progress is written to standard output: a dot whenever the
 /// running tree count is a multiple of 100, otherwise a line break whenever it
 /// is a multiple of 8001.
 /// </remarks>
 /// <param name="tb">The treebank to read.</param>
 /// <param name="name">Label passed to the ObservedCorpusStats constructor.</param>
 /// <returns>The statistics object after ComputeFinalValues() has been called.</returns>
 private TreebankStats.ObservedCorpusStats GatherStats(DiskTreebank tb, string name)
 {
     TreebankStats.ObservedCorpusStats corpusStats = new TreebankStats.ObservedCorpusStats(name);

     if (makeVocab)
     {
         trainVocab = Generics.NewHashSet();
     }

     System.Console.Out.WriteLine("Reading treebank:");
     foreach (Tree currentTree in tb)
     {
         Pair <int, int> facts = DissectTree(currentTree, corpusStats, makeVocab);
         int yieldLength = currentTree.Yield().Count;
         corpusStats.AddStatsForTree(yieldLength, facts.First(), facts.Second());

         // Progress indicator; the newline check only runs when no dot was printed.
         bool onHundred = (corpusStats.numTrees % 100) == 0;
         if (onHundred)
         {
             System.Console.Out.Write(".");
         }
         else if ((corpusStats.numTrees % 8001) == 0)
         {
             System.Console.Out.WriteLine();
         }
     }

     corpusStats.ComputeFinalValues();
     System.Console.Out.WriteLine("done!");
     return corpusStats;
 }
        /// <summary>
        /// Reads the treebank files named on the command line and prints every
        /// tree, either stripping or adding the top-level start-symbol bracket.
        /// </summary>
        /// <remarks>
        /// With option "b" set, a tree whose root label equals the language
        /// pack's start symbol is replaced by its first child. Otherwise a tree
        /// whose root label differs from the start symbol is wrapped in a new
        /// node carrying that symbol. The number of processed trees is reported
        /// on standard error.
        /// </remarks>
        /// <param name="args">Command-line options followed by one or more treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, ArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            // The encoding used to be read from "l", i.e. the language option, so the
            // language name was handed to SetInputEncoding/SetOutputEncoding.
            // NOTE(review): "e" is assumed to be the encoding flag — confirm against ArgDefs()/Usage().
            string encoding      = options.GetProperty("e", "UTF-8");
            bool   removeBracket = PropertiesUtils.GetBool(options, "b", false);

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();

            // Splitting an empty string still yields one (empty) element, so the
            // old "files.Length != 0" test could never reach the usage branch.
            string fileArgs = options.GetProperty(string.Empty, string.Empty).Trim();
            if (fileArgs.Length == 0)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            // string.Split treats "\\s+" as a literal separator; a regex split is
            // needed to break the remaining arguments on whitespace.
            foreach (string filename in System.Text.RegularExpressions.Regex.Split(fileArgs, "\\s+"))
            {
                tb.LoadPath(filename);
            }

            PrintWriter  pwo         = tlpp.Pw();
            string       startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
            ITreeFactory tf          = new LabeledScoredTreeFactory();
            int          nTrees      = 0;

            foreach (Tree t in tb)
            {
                // A foreach iteration variable cannot be reassigned in C#, so the
                // tree to print is tracked in its own local.
                Tree outTree = t;
                if (removeBracket)
                {
                    if (t.Value().Equals(startSymbol))
                    {
                        outTree = t.FirstChild();
                    }
                }
                else if (!t.Value().Equals(startSymbol))
                {
                    // Add a bracket if it isn't already there
                    outTree = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t));
                }
                pwo.Println(outTree.ToString());
                nTrees++;
            }
            pwo.Close();
            System.Console.Error.Printf("Processed %d trees.%n", nTrees);
        }
        /// <summary>Debugging entry point: loads a treebank and applies a head-finding visitor to it.</summary>
        /// <remarks>
        /// Loads the treebank at the given path and hands it to a tree visitor
        /// constructed around a SpanishHeadFinder. Just for debugging. <br />
        /// Usage: <code>
        /// java edu.stanford.nlp.trees.international.spanish.SpanishHeadFinder treebankFilePath
        /// </code>
        /// </remarks>
        /// <param name="args">args[0] is the treebankFilePath</param>
        public static void Main(string[] args)
        {
            Treebank bank = new DiskTreebank();
            CategoryWordTag.suppressTerminalDetails = true;
            bank.LoadPath(args[0]);

            // The visitor receives the head finder it should use on each tree.
            IHeadFinder headFinder = new Edu.Stanford.Nlp.Trees.International.Spanish.SpanishHeadFinder();
            _ITreeVisitor_146 visitor = new _ITreeVisitor_146(headFinder);
            bank.Apply(visitor);
        }
Esempio n. 5
0
        /// <summary>Go through trees and determine their heads and print them.</summary>
        /// <remarks>
        /// Loads the treebank at the given path, percolates heads through each
        /// tree with a DybroFrenchHeadFinder and prints the tree followed by a
        /// blank line. Just for debugging. <br />
        /// Usage: <code>
        /// java edu.stanford.nlp.trees.DybroFrenchHeadFinder treebankFilePath
        /// </code>
        /// </remarks>
        /// <param name="args">args[0] is the treebankFilePath</param>
        public static void Main(string[] args)
        {
            Treebank treebank = new DiskTreebank();

            CategoryWordTag.suppressTerminalDetails = true;
            treebank.LoadPath(args[0]);
            IHeadFinder chf = new Edu.Stanford.Nlp.Trees.International.French.DybroFrenchHeadFinder();

            // This used to call treebank.Apply(null): the visitor was null and the
            // head finder above was never used. Iterate explicitly instead.
            // NOTE(review): PercolateHeads is assumed to be the ported name of
            // Tree.percolateHeads(HeadFinder) — confirm against the Tree class.
            foreach (Tree pt in treebank)
            {
                pt.PercolateHeads(chf);
                pt.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>
        /// Simple testing code: loads a treebank, percolates heads through each
        /// tree with a NoPunctuationHeadFinder and prints the result.
        /// </summary>
        /// <param name="args">args[0] is the treebank file path</param>
        public static void Main(string[] args)
        {
            // simple testing code
            Treebank treebank = new DiskTreebank();

            CategoryWordTag.suppressTerminalDetails = true;
            treebank.LoadPath(args[0]);
            IHeadFinder chf = new NoPunctuationHeadFinder();

            // This used to call treebank.Apply(null): the visitor was null and the
            // head finder above was never used. Iterate explicitly instead.
            // NOTE(review): PercolateHeads is assumed to be the ported name of
            // Tree.percolateHeads(HeadFinder) — confirm against the Tree class.
            foreach (Tree pt in treebank)
            {
                pt.PercolateHeads(chf);
                pt.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
 /// <summary>
 /// Gathers and displays corpus statistics in one of three modes: per split,
 /// per path, or for all paths combined.
 /// </summary>
 /// <remarks>
 /// If the useSplit field is set, one treebank is built per entry of
 /// splitFileLists, the aggregate is displayed first and then each split.
 /// Otherwise, if <paramref name="pathsAreFiles"/> is true, every path gets
 /// its own treebank and its own display; if not, all paths are loaded into a
 /// single treebank. In the first two modes makeVocab is true only for the
 /// first GatherStats call and false afterwards.
 /// </remarks>
 /// <param name="pathsAreFiles">Report each path separately instead of combining them.</param>
 /// <param name="displayWords">Forwarded unchanged to Display.</param>
 /// <param name="displayOOV">Forwarded unchanged to Display.</param>
 public virtual void Run(bool pathsAreFiles, bool displayWords, bool displayOOV)
 {
     if (useSplit)
     {
         IList <TreebankStats.ObservedCorpusStats> perSplitStats = new List <TreebankStats.ObservedCorpusStats>();
         makeVocab = true;
         foreach (KeyValuePair <TreebankStats.Split, ICollection <string> > entry in splitFileLists)
         {
             DiskTreebank splitBank     = tlpp.DiskTreebank();
             IFileFilter  onlyThisSplit = new TreebankStats.SplitFilter(entry.Value);
             foreach (string splitPath in pathNames)
             {
                 splitBank.LoadPath(splitPath, onlyThisSplit);
             }
             string splitLabel = languageName.ToString() + "." + entry.Key.ToString();
             perSplitStats.Add(GatherStats(splitBank, splitLabel));
             makeVocab = false;
         }

         // Aggregate first, then the individual splits.
         Display(AggregateStats(perSplitStats), displayWords, displayOOV);
         foreach (TreebankStats.ObservedCorpusStats oneSplit in perSplitStats)
         {
             Display(oneSplit, displayWords, displayOOV);
         }
         return;
     }

     if (pathsAreFiles)
     {
         makeVocab = true;
         foreach (string filePath in pathNames)
         {
             DiskTreebank fileBank = tlpp.DiskTreebank();
             fileBank.LoadPath(filePath, null);
             string fileLabel = languageName.ToString() + "  " + filePath;
             Display(GatherStats(fileBank, fileLabel), displayWords, displayOOV);
             makeVocab = false;
         }
         return;
     }

     // Combined mode: one treebank over every path.
     trainVocab = Generics.NewHashSet();
     DiskTreebank combinedBank = tlpp.DiskTreebank();
     foreach (string anyPath in pathNames)
     {
         combinedBank.LoadPath(anyPath, null);
     }
     Display(GatherStats(combinedBank, languageName.ToString()), displayWords, displayOOV);
 }
Esempio n. 8
0
        /// <summary>
        /// Calculate parent annotation statistics suitable for doing
        /// selective parent splitting in the PCFGParser inside
        /// FactoredParser.
        /// </summary>
        /// <remarks>
        /// Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats
        /// [-tags] treebankPath <p>
        /// The code also accepts <c>-cutOff value</c>: when given, the result of
        /// GetSplitCategories for that cut-off is printed instead of the
        /// full statistics. Unknown options are logged and skipped.
        /// </remarks>
        /// <param name="args">Optional flags followed by the path to the Treebank</param>
        public static void Main(string[] args)
        {
            const string usage = "Usage: java edu.stanford.nlp.parser.lexparser.ParentAnnotationStats [-tags] treebankPath";

            if (args.Length < 1)
            {
                System.Console.Out.WriteLine(usage);
                return;
            }

            bool   doTags    = false;
            bool   useCutOff = false;
            double cutOff    = 0.0;
            int    i         = 0;

            // Bounds-checked: without the length test a command line made only of
            // flags ran past the end of args.
            while (i < args.Length && args[i].StartsWith("-", System.StringComparison.Ordinal))
            {
                if (args[i].Equals("-tags"))
                {
                    doTags = true;
                    i++;
                }
                else if (args[i].Equals("-cutOff") && i + 1 < args.Length)
                {
                    useCutOff = true;
                    // double.ParseDouble is not a .NET API; parse culture-independently.
                    cutOff = double.Parse(args[i + 1], System.Globalization.CultureInfo.InvariantCulture);
                    i     += 2;
                }
                else
                {
                    log.Info("Unknown option: " + args[i]);
                    i++;
                }
            }

            if (i >= args.Length)
            {
                // Flags were given but no treebank path followed them.
                System.Console.Out.WriteLine(usage);
                return;
            }

            Treebank treebank = new DiskTreebank(null);
            treebank.LoadPath(args[i]);
            if (useCutOff)
            {
                ICollection <string> splitters = GetSplitCategories(treebank, doTags, 0, cutOff, cutOff, null);
                System.Console.Out.WriteLine(splitters);
            }
            else
            {
                Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats pas = new Edu.Stanford.Nlp.Parser.Lexparser.ParentAnnotationStats(null, doTags);
                treebank.Apply(pas);
                pas.PrintStats();
            }
        }
        /// <summary>
        /// Loads trees from <paramref name="path"/> and returns those whose yield
        /// length lies between <paramref name="minLength"/> and
        /// <paramref name="maxLength"/>, both inclusive.
        /// </summary>
        /// <param name="path">Treebank path handed to LoadPath.</param>
        /// <param name="low">First argument of the NumberRangeFileFilter used while loading.</param>
        /// <param name="high">Second argument of the NumberRangeFileFilter used while loading.</param>
        /// <param name="minLength">Smallest accepted yield length.</param>
        /// <param name="maxLength">Largest accepted yield length.</param>
        /// <returns>The accepted trees, in treebank iteration order.</returns>
        public static IList <Tree> GetTrees(string path, int low, int high, int minLength, int maxLength)
        {
            Treebank treebank = new DiskTreebank(null);

            treebank.LoadPath(path, new NumberRangeFileFilter(low, high, true));
            IList <Tree> trees = new List <Tree>();

            foreach (Tree tree in treebank)
            {
                // Compute the yield once per tree; it was previously built twice
                // just to read its Count.
                int yieldLength = tree.Yield().Count;
                if (yieldLength >= minLength && yieldLength <= maxLength)
                {
                    trees.Add(tree);
                }
            }
            return trees;
        }
Esempio n. 10
0
        /// <summary>
        /// Counts the child-label sequences of every node matching a category and
        /// prints them in descending order of frequency.
        /// </summary>
        /// <remarks>
        /// Arguments are processed left to right. "-l" selects the language
        /// parameters and "-e" the encoding; any other flag prints the usage and
        /// exits. Each positional category must be followed by a treebank path.
        /// The category is compiled as the Tregex pattern "@" + category; if
        /// several are given, the last one is used for matching. Options must
        /// precede the first category, because the treebank is created when the
        /// first category is seen.
        /// </remarks>
        /// <param name="args">[-l language] [-e encoding] category treebankPath</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank  tb        = null;
            string        encoding  = "UTF-8";
            TregexPattern rootMatch = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (i + 1 >= args.Length)
                {
                    // Both a known flag and a category need one more argument
                    // after them; without it args[++i] / args[i] ran out of bounds.
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                }
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    rootMatch = TregexPattern.Compile("@" + args[i++]);
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    // No second increment here: the for-loop already advances past
                    // the path. The extra "i++" used to skip the following argument.
                    tb.LoadPath(args[i]);
                }
            }
            if (tb == null || rootMatch == null)
            {
                // Only flags were supplied, so there is nothing to read or match.
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ICounter <string> rhsCounter = new ClassicCounter <string>();

            foreach (Tree t in tb)
            {
                TregexMatcher m = rootMatch.Matcher(t);
                while (m.FindNextMatchingNode())
                {
                    // Key is the space-separated labels of the matched node's children.
                    Tree          match = m.GetMatch();
                    StringBuilder sb    = new StringBuilder();
                    foreach (Tree kid in match.Children())
                    {
                        sb.Append(kid.Value()).Append(" ");
                    }
                    rhsCounter.IncrementCount(sb.ToString().Trim());
                }
            }
            IList <string> biggestKeys = new List <string>(rhsCounter.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(rhsCounter));
            PrintWriter pw = tlpp.Pw();

            foreach (string rhs in biggestKeys)
            {
                pw.Printf("%s\t%d%n", rhs, (int)rhsCounter.GetCount(rhs));
            }
            pw.Close();
        }
Esempio n. 11
0
        /// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
        /// <remarks>
        /// This main method doesn't yet handle as many flags as one would like.
        /// The flags it does handle (matched case-insensitively) are:
        /// <ul>
        /// <li>-tlp className (a TreebankLanguagePack)</li>
        /// <li>-tlpp className (a TreebankLangParserParams)</li>
        /// <li>-insideFactor</li>
        /// <li>-markovOrder int</li>
        /// <li>-simpleLabels</li>
        /// <li>-noRebinarization</li>
        /// </ul>
        /// Any other flag is logged as unknown and skipped. When -tlpp is given,
        /// the treebank, language pack, file extension and head finder all come
        /// from it. Each tree is printed in original and binarized form.
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: flags as above, followed by treebankPath
        /// </param>
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpp = null;
            // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
            // Looks like it must build CategoryWordTagFactory!!
            ITreeReaderFactory    trf     = null;
            string                fileExt = "mrg";
            IHeadFinder           hf      = new ModCollinsHeadFinder();
            ITreebankLanguagePack tlp     = new PennTreebankLanguagePack();
            bool   insideFactor           = false;
            bool   mf               = false;
            int    mo               = 1;
            bool   uwl              = false;
            bool   uat              = false;
            double sst              = 20.0;
            bool   mfs              = false;
            bool   simpleLabels     = false;
            bool   noRebinarization = false;
            int    i = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                string flag     = args[i];
                bool   hasValue = i + 1 < args.Length;

                // A value-taking flag with no value falls through to "Unknown option".
                if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlp") && hasValue)
                {
                    tlp = InstantiateFlagClass <ITreebankLanguagePack>(args[++i]);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlpp") && hasValue)
                {
                    tlpp = InstantiateFlagClass <ITreebankLangParserParams>(args[++i]);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-insideFactor"))
                {
                    insideFactor = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-markovOrder") && hasValue)
                {
                    mo = System.Convert.ToInt32(args[++i]);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-simpleLabels"))
                {
                    simpleLabels = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-noRebinarization"))
                {
                    noRebinarization = true;
                }
                else
                {
                    log.Info("Unknown option:" + flag);
                }
                i++;
            }
            if (i >= args.Length)
            {
                log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
                System.Environment.Exit(0);
            }
            Treebank treebank;

            if (tlpp != null)
            {
                treebank = tlpp.MemoryTreebank();
                tlp      = tlpp.TreebankLanguagePack();
                fileExt  = tlp.TreebankFileExtension();
                hf       = tlpp.HeadFinder();
            }
            else
            {
                treebank = new DiskTreebank(trf);
            }
            treebank.LoadPath(args[i], fileExt, true);
            ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);

            foreach (Tree t in treebank)
            {
                Tree newT = tt.TransformTree(t);
                System.Console.Out.WriteLine("Original tree:");
                t.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                newT.PennPrint();
                System.Console.Out.WriteLine();
            }
        }

        /// <summary>
        /// Creates an instance of the class named on the command line and casts it to <typeparamref name="T"/>.
        /// </summary>
        /// <param name="className">Type name taken from the command line.</param>
        /// <returns>The new instance.</returns>
        /// <exception cref="System.ArgumentException">
        /// The type could not be resolved, instantiated or cast; the original
        /// failure is kept as the inner exception.
        /// </exception>
        private static T InstantiateFlagClass <T>(string className)
        {
            try
            {
                return (T)System.Activator.CreateInstance(Sharpen.Runtime.GetType(className));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + className);
                // The old "new Exception(e)" passed the exception where a message is
                // expected; wrap it so the cause is preserved as InnerException.
                throw new System.ArgumentException("Couldn't instantiate: " + className, e);
            }
        }
Esempio n. 12
0
        /// <summary>
        /// Provides some testing and opportunities for exploration of the
        /// probabilities of a BaseLexicon.
        /// </summary>
        /// <remarks>
        /// Trains a BaseLexicon on the given treebank files and then reports on
        /// each test word. What's here currently probably only works for the
        /// English Penn Treebank, as it uses default constructors. For a known
        /// word, the log probabilities of its taggings are printed using the
        /// word's position among the test words. For an unknown word, its
        /// signature is printed, then the score of every tag in sorted order,
        /// and finally the tags that scored negative infinity.
        /// </remarks>
        /// <param name="args">
        /// The command line arguments:
        /// java BaseLexicon treebankPath fileRange unknownWordModel words
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 3)
            {
                log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
                return;
            }
            System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
            Treebank tb = new DiskTreebank();

            tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));
            // TODO: change this interface so the lexicon creates its own indices?
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();
            Options         op        = new Options();

            op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);
            Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
            lex.InitializeTraining(tb.Count);
            lex.Train(tb);
            lex.FinishTraining();
            System.Console.Out.WriteLine("done.");
            System.Console.Out.WriteLine();
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(4);
            IList <string> impos = new List <string>();

            // Test words start at args[3]; i - 3 is the word's position among them.
            for (int i = 3; i < args.Length; i++)
            {
                if (lex.IsKnown(args[i]))
                {
                    System.Console.Out.WriteLine(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
                    for (IEnumerator <IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(args[i]), i - 3, null); it.MoveNext();)
                    {
                        IntTaggedWord iTW = it.Current;
                        System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, i - 3, wordIndex.Get(iTW.word), null)));
                    }
                }
                else
                {
                    string sig = lex.GetUnknownWordModel().GetSignature(args[i], i - 3);
                    // " non-init" needs its leading space; without it the level and
                    // the label ran together (e.g. "uwm 5non-init").
                    System.Console.Out.WriteLine(args[i] + " is an unknown word.  Signature with uwm " + lex.GetUnknownWordModel().GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
                    impos.Clear();
                    IList <string> lis = new List <string>(tagIndex.ObjectsList());
                    lis.Sort();
                    foreach (string tStr in lis)
                    {
                        IntTaggedWord iTW   = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                        double        score = lex.Score(iTW, 1, args[i], null);
                        if (double.IsNegativeInfinity(score))
                        {
                            // Collected and reported together after the scored tags.
                            impos.Add(tStr);
                        }
                        else
                        {
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                        }
                    }
                    if (impos.Count > 0)
                    {
                        System.Console.Out.WriteLine(args[i] + " impossible tags: " + impos);
                    }
                }
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>
        /// Counts the word forms that carry a given tag and prints them in
        /// descending order of frequency.
        /// </summary>
        /// <remarks>
        /// Arguments are processed left to right. "-l" selects the language
        /// parameters and "-e" the encoding; any other flag prints the usage and
        /// exits. Each positional tag must be followed by a treebank path; if
        /// several tags are given, the last one is used for counting. Options
        /// must precede the first tag, because the treebank is created when the
        /// first tag is seen.
        /// </remarks>
        /// <param name="args">[-l language] [-e encoding] tag treebankPath</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank tb       = null;
            string       encoding = "UTF-8";
            string       puncTag  = null;

            for (int i = 0; i < args.Length; i++)
            {
                if (i + 1 >= args.Length)
                {
                    // Both a known flag and a tag need one more argument after
                    // them; without it args[++i] / args[i] ran out of bounds.
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                }
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        Language lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    // Positional pair: the tag, then the treebank path that follows it.
                    puncTag = args[i++];
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    tb.LoadPath(args[i]);
                }
            }
            if (tb == null)
            {
                // Only flags were supplied, so there is no treebank to iterate.
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ICounter <string> puncTypes = new ClassicCounter <string>();

            foreach (Tree t in tb)
            {
                IList <CoreLabel> yield = t.TaggedLabeledYield();
                foreach (CoreLabel word in yield)
                {
                    if (word.Tag().Equals(puncTag))
                    {
                        puncTypes.IncrementCount(word.Word());
                    }
                }
            }
            IList <string> biggestKeys = new List <string>(puncTypes.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(puncTypes));
            PrintWriter pw = tlpp.Pw();

            foreach (string wordType in biggestKeys)
            {
                pw.Printf("%s\t%d%n", wordType, (int)puncTypes.GetCount(wordType));
            }
            pw.Close();
        }
Esempio n. 14
0
        /// <summary>
        /// Reads a treebank and prints rule and branching statistics for it.
        /// </summary>
        /// <remarks>
        /// A tree whose root label is "ROOT" is replaced by its first child
        /// before counting. Every phrasal subtree with more than one child is
        /// counted as a "binary" rule and all other phrasal subtrees as unary.
        /// The output lists the number of trees, binary rules, distinct binary
        /// rule types, the mean and standard deviation of the branching factor,
        /// the entropy of the rule-type counter and the number of unary rules.
        /// </remarks>
        /// <param name="args">Command-line options (language "l", encoding "e") and the treebank file name.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            // Process command-line options
            Properties options  = StringUtils.ArgsToProperties(args, optionArgDefinitions);
            string     fileName = options.GetProperty(string.Empty);

            if (string.IsNullOrEmpty(fileName))
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            string encoding = options.GetProperty("e", "UTF-8");

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();

            tb.LoadPath(fileName);
            // Statistics
            ICounter <string> binaryRuleTypes  = new ClassicCounter <string>(20000);
            IList <int>       branchingFactors = new List <int>(20000);
            int nTrees                 = 0;
            int nUnaryRules            = 0;
            int nBinaryRules           = 0;
            int binaryBranchingFactors = 0;
            // Read the treebank
            // NOTE(review): pw is never written to in this method; kept in case
            // tlpp.Pw() has side effects — confirm and remove if it has none.
            PrintWriter pw = tlpp.Pw();

            foreach (Tree tree in tb)
            {
                // A foreach iteration variable cannot be reassigned in C#, so the
                // tree to analyse is tracked in its own local.
                Tree root = tree;
                if (root.Value().Equals("ROOT"))
                {
                    root = root.FirstChild();
                }
                ++nTrees;
                foreach (Tree subTree in root)
                {
                    if (!subTree.IsPhrasal())
                    {
                        continue;
                    }
                    int arity = subTree.NumChildren();
                    if (arity > 1)
                    {
                        ++nBinaryRules;
                        branchingFactors.Add(arity);
                        binaryBranchingFactors += arity;
                        binaryRuleTypes.IncrementCount(TreeToRuleString(subTree));
                    }
                    else
                    {
                        ++nUnaryRules;
                    }
                }
            }
            double mean = (double)binaryBranchingFactors / (double)nBinaryRules;

            System.Console.Out.Printf("#trees:\t%d%n", nTrees);
            System.Console.Out.Printf("#binary:\t%d%n", nBinaryRules);
            System.Console.Out.Printf("#binary types:\t%d%n", binaryRuleTypes.KeySet().Count);
            System.Console.Out.Printf("mean branching:\t%.4f%n", mean);
            System.Console.Out.Printf("stddev branching:\t%.4f%n", StandardDeviation(branchingFactors, mean));
            System.Console.Out.Printf("rule entropy:\t%.5f%n", Counters.Entropy(binaryRuleTypes));
            System.Console.Out.Printf("#unaries:\t%d%n", nUnaryRules);
        }
 /// <summary>
 /// Creates the dataset and points its treebank at a DiskTreebank that uses
 /// the raw Arabic tree reader factory.
 /// </summary>
 public ATBArabicDataset()
 {
     // The raw file is always read as UTF-8, whatever the output encoding is.
     ArabicTreeReaderFactory.ArabicRawTreeReaderFactory rawReaderFactory = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);
     treebank = new DiskTreebank(rawReaderFactory, "UTF-8");
 }
Esempio n. 16
0
        /// <summary>
        /// Counts the unknown-word signatures produced for first-occurrence words in the later
        /// part of a treebank and prints them, most frequent first.
        /// </summary>
        /// <remarks>
        /// Recognized options are <c>-l</c> (language, which also selects the parser parameters)
        /// and <c>-e</c> (encoding); each consumes the following argument. Every other argument
        /// is loaded as a treebank path. The usage text is printed and the process exits with
        /// status -1 when too few arguments are given, when an option is unknown or is missing
        /// its value, or when no treebank path is supplied.
        /// </remarks>
        /// <param name="args">Options followed by one or more treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank tb       = null;
            string       encoding = "UTF-8";
            Language     lang     = Language.English;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    // Every recognized option consumes the next argument, so a flag in last
                    // position is a usage error rather than an out-of-range array access.
                    if (i + 1 >= args.Length)
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                    }
                    switch (args[i])
                    {
                    case "-l":
                    {
                        lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            // The treebank is created lazily on the first path so that it picks
                            // up the language and encoding options given before that path.
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    tb.LoadPath(args[i]);
                }
            }
            if (tb == null)
            {
                // Only options were supplied; without a treebank there is nothing to count.
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            PrintWriter pw = tlpp.Pw();
            Options     op = new Options();

            // Language-specific unknown-word model settings.
            Options.LexOptions lexOptions = op.lexOptions;
            if (lang == Language.French)
            {
                lexOptions.useUnknownWordSignatures = 1;
                lexOptions.smartMutation            = false;
                lexOptions.unknownSuffixSize        = 2;
                lexOptions.unknownPrefixSize        = 1;
            }
            else if (lang == Language.Arabic)
            {
                lexOptions.smartMutation            = false;
                lexOptions.useUnknownWordSignatures = 9;
                lexOptions.unknownPrefixSize        = 1;
                lexOptions.unknownSuffixSize        = 1;
            }
            IIndex <string>   wordIndex    = new HashIndex <string>();
            IIndex <string>   tagIndex     = new HashIndex <string>();
            ILexicon          lex          = tlpp.Lex(op, wordIndex, tagIndex);
            int               computeAfter = (int)(0.50 * tb.Count);
            ICounter <string> vocab        = new ClassicCounter <string>();
            ICounter <string> unkCounter   = new ClassicCounter <string>();
            int               treeId       = 0;

            foreach (Tree t in tb)
            {
                IList <ILabel> words = t.Yield();
                int            posId = 0;
                foreach (ILabel word in words)
                {
                    string token = word.Value();
                    vocab.IncrementCount(token);
                    // Signatures are only collected once more than computeAfter trees have been
                    // read, and only while the word's running count is still below 2.
                    if (treeId > computeAfter && vocab.GetCount(token) < 2.0)
                    {
                        unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(token, posId));
                    }
                    // Advance for every word so posId is the word's index within the sentence,
                    // not the number of signatures collected so far in this tree.
                    posId++;
                }
                treeId++;
            }
            IList <string> biggestKeys = new List <string>(unkCounter.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
            try
            {
                foreach (string wordType in biggestKeys)
                {
                    pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
                }
            }
            finally
            {
                // Close exactly once.
                pw.Close();
            }
        }
Esempio n. 17
0
        // = false;
        // not an instantiable class
        /// <summary>Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ...</summary>
        /// <remarks>
        /// Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
        /// <h4>Arguments:</h4>
        /// Each argument should be the name of a transformation file that contains a list of pattern
        /// and transformation operation list pairs.  That is, it is a sequence of pairs of a
        /// <see cref="Edu.Stanford.Nlp.Trees.Tregex.TregexPattern"/>
        /// pattern on one or more lines, then a
        /// blank line (empty or whitespace), then a list of transformation operations one per line
        /// (as specified by <b>Legal operation syntax</b> below) to apply when the pattern is matched,
        /// and then another blank line (empty or whitespace).
        /// Note the need for blank lines: The code crashes if they are not present as separators
        /// (although the blank line at the end of the file can be omitted).
        /// The script file can include comment lines, either whole comment lines or
        /// trailing comments introduced by %, which extend to the end of line.  A needed percent
        /// mark can be escaped by a preceding backslash.
        /// <p>
        /// For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
        /// and relabel the SQ node to S, your transformation file would look like this:
        /// <blockquote>
        /// <code>
        /// SBARQ=n1 &lt; SQ=n2<br />
        /// <br />
        /// excise n1 n1<br />
        /// relabel n2 S
        /// </code>
        /// </blockquote>
        /// <h4>Options:</h4>
        /// <ul>
        /// <li>
        /// <c>-treeFile &lt;filename&gt;</c>
        /// specify the name of the file that has the trees you want to transform.
        /// <li>
        /// <c>-po &lt;matchPattern&gt; &lt;operation&gt;</c>
        /// Apply a single operation to every tree using the specified match pattern and the specified operation.  Use this option
        /// when you want to quickly try the effect of one pattern/surgery combination, and are too lazy to write a transformation file.
        /// <li>
        /// <c>-s</c>
        /// Print each output tree on one line (default is pretty-printing).
        /// <li>
        /// <c>-m</c>
        /// For every tree that had a matching pattern, print "before" (prepended as "Operated on:") and "after" (prepended as "Result:").  Unoperated on trees just pass through the transducer as usual.
        /// <li>
        /// <c>-encoding X</c>
        /// Uses character set X for input and output of trees.
        /// <li>
        /// <c>-macros &lt;filename&gt;</c>
        /// A file of macros to use on the tregex pattern.  Macros should be one per line, with original and replacement separated by tabs.
        /// <li>
        /// <c>-hf &lt;headFinder-class-name&gt;</c>
        /// use the specified
        /// <see cref="Edu.Stanford.Nlp.Trees.IHeadFinder"/>
        /// class to determine headship relations.
        /// <li>
        /// <c>-hfArg &lt;string&gt;</c>
        /// pass a string argument in to the
        /// <see cref="Edu.Stanford.Nlp.Trees.IHeadFinder"/>
        /// class's constructor.
        /// <c>-hfArg</c>
        /// can be used multiple times to pass in multiple arguments.
        /// <li>
        /// <c>-trf &lt;TreeReaderFactory-class-name&gt;</c>
        /// use the specified
        /// <see cref="Edu.Stanford.Nlp.Trees.ITreeReaderFactory"/>
        /// class to read trees from files.
        /// </ul>
        /// <h4>Legal operation syntax:</h4>
        /// <ul>
        /// <li>
        /// <c>delete &lt;name&gt;</c>
        /// deletes the node and everything below it.
        /// <li>
        /// <c>prune &lt;name&gt;</c>
        /// Like delete, but if, after the pruning, the parent has no children anymore, the parent is pruned too.  Pruning continues to affect all ancestors until one is found with remaining children.  This may result in a null tree.
        /// <li>
        /// <c>excise &lt;name1&gt; &lt;name2&gt;</c>
        /// The name1 node should either dominate or be the same as the name2 node.  This excises out everything from
        /// name1 to name2.  All the children of name2 go into the parent of name1, where name1 was.
        /// <li>
        /// <c>relabel &lt;name&gt; &lt;new-label&gt;</c>
        /// Relabels the node to have the new label. <br />
        /// There are three possible forms: <br />
        /// <c>relabel nodeX VP</c>
        /// - for changing a node label to an
        /// alphanumeric string <br />
        /// <c>relabel nodeX /''/</c>
        /// - for relabeling a node to
        /// something that isn't a valid identifier without quoting <br />
        /// <c>relabel nodeX /^VB(.*)$/verb\\/$1/</c>
        /// - for regular
        /// expression based relabeling. In this case, all matches of the
        /// regular expression against the node label are replaced with the
        /// replacement String.  This has the semantics of Java/Perl's
        /// replaceAll: you may use capturing groups and put them in
        /// replacements with $n. For example, if the pattern is /foo/bar/
        /// and the node matched is "foo", the replaceAll semantics result in
        /// "barbar".  If the pattern is /^foo(.*)$/bar$1/ and node matched is
        /// "foofoo", relabel will result in "barfoo".  <br />
        /// When using the regex replacement method, you can also use the
        /// sequences ={node} and %{var} in the replacement string to use
        /// captured nodes or variable strings in the replacement string.
        /// For example, if the Tregex pattern was "duck=bar" and the relabel
        /// is /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br />
        /// To concatenate two nodes named in the tregex pattern, for
        /// example, you can use the pattern /^.*$/={foo}={bar}/.  Note that
        /// the ^.*$ is necessary to make sure the regex pattern only matches
        /// and replaces once on the entire node name. <br />
        /// To get an "=" or a "%" in the replacement, use \ escaping.
        /// Also, as in the example you can escape a slash in the middle of
        /// the second and third forms with \\/ and \\\\. <br />
        /// <li>
        /// <c>insert &lt;name&gt; &lt;position&gt;</c>
        /// or
        /// <c>insert &lt;tree&gt; &lt;position&gt;</c>
        /// inserts the named node or tree into the position specified.
        /// <li>
        /// <c>move &lt;name&gt; &lt;position&gt;</c>
        /// moves the named node into the specified position.
        /// <p>Right now the only ways to specify position are:
        /// <p>
        /// <c>$+ &lt;name&gt;</c>
        /// the left sister of the named node<br />
        /// <c>$- &lt;name&gt;</c>
        /// the right sister of the named node<br />
        /// <c>&gt;i &lt;name&gt;</c>
        /// the i_th daughter of the named node<br />
        /// <c>&gt;-i &lt;name&gt;</c>
        /// the i_th daughter, counting from the right, of the named node.
        /// <li>
        /// <c>replace &lt;name1&gt; &lt;name2&gt;</c>
        /// deletes name1 and inserts a copy of name2 in its place.
        /// <li>
        /// <c>replace &lt;name&gt; &lt;tree&gt; &lt;tree2&gt;...</c>
        /// deletes name and inserts the new tree(s) in its place.  If
        /// more than one replacement tree is given, each of the new
        /// subtrees will be added in order where the old tree was.
        /// Multiple subtrees at the root is an illegal operation and
        /// will throw an exception.
        /// <li>
        /// <c>createSubtree &lt;auxiliary-tree-or-label&gt; &lt;name1&gt; [&lt;name2&gt;]</c>
        /// Create a subtree out of all the nodes from
        /// <c>&lt;name1&gt;</c>
        /// through
        /// <c>&lt;name2&gt;</c>
        /// . The subtree is moved to the foot of the given
        /// auxiliary tree, and the tree is inserted where the nodes of
        /// the subtree used to reside. If a simple label is provided as
        /// the first argument, the subtree is given a single parent with
        /// a name corresponding to the label.  To limit the operation to
        /// just one node, elide
        /// <c>&lt;name2&gt;</c>
        /// .
        /// <li>
        /// <c>adjoin &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Adjoins the specified auxiliary tree into the named node.
        /// The daughters of the target node will become the daughters of the foot of the auxiliary tree.
        /// <li>
        /// <c>adjoinH &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Similar to adjoin, but preserves the target node
        /// and makes it the root of
        /// <c>&lt;tree&gt;</c>
        /// . (It is still accessible as
        /// <c>name</c>
        /// .  The root of the
        /// auxiliary tree is ignored.)
        /// <li>
        /// <c>adjoinF &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Similar to adjoin,
        /// but preserves the target node and makes it the foot of
        /// <c>&lt;tree&gt;</c>
        /// .
        /// (It is still accessible as
        /// <c>name</c>
        /// , and retains its status as parent of its children.
        /// The root of the auxiliary tree is ignored.)
        /// <li>
        /// <c>coindex &lt;name1&gt; &lt;name2&gt; ... &lt;nameM&gt;</c>
        /// Puts a (Penn Treebank style)
        /// coindexation suffix of the form "-N" on each of nodes name_1 through name_m.  The value of N will be
        /// automatically generated in reference to the existing coindexations in the tree, so that there is never
        /// an accidental clash of indices across things that are not meant to be coindexed.
        /// </ul>
        /// <p>
        /// In the context of
        /// <c>adjoin</c>
        /// ,
        /// <c>adjoinH</c>
        /// ,
        /// <c>adjoinF</c>
        /// , and
        /// <c>createSubtree</c>
        /// , an auxiliary
        /// tree is a tree in Penn Treebank format with
        /// <c>@</c>
        /// on
        /// exactly one of the leaves denoting the foot of the tree.
        /// The operations which use the foot use the labeled node.
        /// For example:
        /// </p>
        /// <blockquote>
        /// Tsurgeon:
        /// <c>adjoin (FOO (BAR@)) foo</c>
        /// <br />
        /// Tregex:
        /// <c>B=foo</c>
        /// <br />
        /// Input:
        /// <c>(A (B 1 2))</c>
        /// Output:
        /// <c>(A (FOO (BAR 1 2)))</c>
        /// </blockquote>
        /// <p>
        /// Tsurgeon applies the same operation to the same tree for as long
        /// as the given tregex operation matches.  This means that infinite
        /// loops are very easy to cause.  One common situation where this comes up
        /// is with an insert operation, which will repeat infinitely many times
        /// unless you add an expression to the tregex that matches against
        /// the inserted pattern.  For example, this pattern will infinite loop:
        /// </p>
        /// <blockquote>
        /// <code>
        /// TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br />
        /// TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
        /// </code>
        /// </blockquote>
        /// <p>
        /// This pattern, though, will terminate:
        /// </p>
        /// <blockquote>
        /// <code>
        /// TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br />
        /// TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
        /// </code>
        /// </blockquote>
        /// <p>
        /// Tsurgeon has (very) limited support for conditional statements.
        /// If a pattern is prefaced with
        /// <c>if exists &lt;name&gt;</c>
        /// ,
        /// the rest of the pattern will only execute if
        /// the named node was found in the corresponding TregexMatcher.
        /// </p>
        /// </remarks>
        /// <param name="args">
        /// a list of names of files each of which contains a single tregex matching pattern plus a list, one per line,
        /// of transformation operations to apply to the matched pattern.
        /// </param>
        /// <exception cref="System.Exception">If an I/O or pattern syntax error occurs</exception>
        public static void Main(string[] args)
        {
            // Names of the command-line options understood by this entry point.
            string headFinderOption    = "-hf";
            string headFinderArgOption = "-hfArg";
            string encodingOption      = "-encoding";
            string singleLineOption    = "-s";
            string verboseOption       = "-v";
            // if set, then print original form of trees that are matched & thus operated on
            string matchedOption          = "-m";
            string patternOperationOption = "-po";
            string treeFileOption         = "-treeFile";
            string trfOption              = "-trf";
            string macroOption            = "-macros";

            // Defaults used when the corresponding option is absent.
            string   headFinderClassName = null;
            string[] headFinderArgs      = null;
            string   encoding            = "UTF-8";
            string   macroFilename       = string.Empty;
            string   treePrintFormats;

            if (args.Length == 0)
            {
                log.Info("Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
                System.Environment.Exit(0);
            }

            // Number of values each option consumes. Every option is registered explicitly,
            // including the value-less -s/-v/-m switches and -hfArg, so that parsing does not
            // depend on whatever ArgsToMap assumes for flags it has not been told about.
            IDictionary <string, int> flagMap = Generics.NewHashMap();

            flagMap[patternOperationOption] = 2;
            flagMap[treeFileOption]         = 1;
            flagMap[trfOption]           = 1;
            flagMap[singleLineOption]    = 0;
            flagMap[verboseOption]       = 0;
            flagMap[matchedOption]       = 0;
            flagMap[encodingOption]      = 1;
            flagMap[headFinderOption]    = 1;
            flagMap[headFinderArgOption] = 1;
            flagMap[macroOption]         = 1;
            IDictionary <string, string[]> argsMap = StringUtils.ArgsToMap(args, flagMap);

            // Arguments not attached to any option are stored under the null key; they are the
            // operation files.
            args = argsMap[null];
            if (argsMap.Contains(headFinderOption))
            {
                headFinderClassName = argsMap[headFinderOption][0];
            }
            if (argsMap.Contains(headFinderArgOption))
            {
                headFinderArgs = argsMap[headFinderArgOption];
            }
            if (argsMap.Contains(verboseOption))
            {
                verbose = true;
            }
            if (argsMap.Contains(singleLineOption))
            {
                treePrintFormats = "oneline,";
            }
            else
            {
                treePrintFormats = "penn,";
            }
            if (argsMap.Contains(encodingOption))
            {
                encoding = argsMap[encodingOption][0];
            }
            if (argsMap.Contains(macroOption))
            {
                macroFilename = argsMap[macroOption][0];
            }
            TreePrint          tp    = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
            PrintWriter        pwOut = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);
            ITreeReaderFactory trf;

            if (argsMap.Contains(trfOption))
            {
                string trfClass = argsMap[trfOption][0];
                trf = ReflectionLoading.LoadByReflection(trfClass);
            }
            else
            {
                trf = new TregexPattern.TRegexTreeReaderFactory();
            }
            Treebank trees = new DiskTreebank(trf, encoding);

            if (argsMap.Contains(treeFileOption))
            {
                trees.LoadPath(argsMap[treeFileOption][0]);
            }
            if (trees.IsEmpty())
            {
                log.Info("Warning: No trees specified to operate on.  Use -treeFile path option.");
            }
            TregexPatternCompiler compiler;

            if (headFinderClassName == null)
            {
                compiler = new TregexPatternCompiler();
            }
            else
            {
                IHeadFinder hf;
                if (headFinderArgs == null)
                {
                    hf = ReflectionLoading.LoadByReflection(headFinderClassName);
                }
                else
                {
                    hf = ReflectionLoading.LoadByReflection(headFinderClassName, (object[])headFinderArgs);
                }
                compiler = new TregexPatternCompiler(hf);
            }
            Macros.AddAllMacros(compiler, macroFilename, encoding);
            IList <Pair <TregexPattern, TsurgeonPattern> > ops = new List <Pair <TregexPattern, TsurgeonPattern> >();

            if (argsMap.Contains(patternOperationOption))
            {
                // -po supplies one pattern/operation pair inline and takes precedence over
                // any operation files.
                string[] patternAndOperation = argsMap[patternOperationOption];
                if (patternAndOperation.Length < 2)
                {
                    throw new System.ArgumentException(patternOperationOption + " requires a matching pattern followed by an operation");
                }
                TregexPattern   matchPattern = compiler.Compile(patternAndOperation[0]);
                TsurgeonPattern p            = ParseOperation(patternAndOperation[1]);
                ops.Add(new Pair <TregexPattern, TsurgeonPattern>(matchPattern, p));
            }
            else
            {
                foreach (string arg in args)
                {
                    IList <Pair <TregexPattern, TsurgeonPattern> > pairs = GetOperationsFromFile(arg, encoding, compiler);
                    foreach (Pair <TregexPattern, TsurgeonPattern> pair in pairs)
                    {
                        if (verbose)
                        {
                            log.Info(pair.Second());
                        }
                        ops.Add(pair);
                    }
                }
            }

            // The option lookup does not change per tree, so do it once.
            bool printMatched = argsMap.Contains(matchedOption);

            foreach (Tree t in trees)
            {
                // Copy before processing so the untouched tree can still be shown afterwards.
                Tree original = t.DeepCopy();
                Tree result   = ProcessPatternsOnTree(ops, t);
                if (printMatched && matchedOnTree)
                {
                    pwOut.Println("Operated on: ");
                    DisplayTree(original, tp, pwOut);
                    pwOut.Println("Result: ");
                }
                DisplayTree(result, tp, pwOut);
            }
        }
        /// <summary>
        /// Reads a treebank, optionally prints every tree that passes the length filter in one
        /// of several formats, and reports on standard error how many trees passed.
        /// </summary>
        /// <remarks>
        /// Option keys read from the parsed properties:
        /// <c>y</c> maximum yield length (default <c>int.MaxValue</c>),
        /// <c>p</c> print trees, <c>f</c> print flattened yields, <c>a</c> print pre-terminal
        /// yields, <c>t</c> print word/tag pairs one per line, <c>l</c> language (default
        /// English) and <c>e</c> encoding (default UTF-8). The output modes are checked in the
        /// order p, f, a, t and only the first enabled one is used. The treebank path is taken
        /// from the property stored under the empty key; the usage text is printed and the
        /// process exits with status -1 when it is missing.
        /// </remarks>
        /// <param name="args">Command-line options and the treebank path.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            // Process command-line options
            Properties options  = StringUtils.ArgsToProperties(args, optionArgDefinitions);
            string     fileName = options.GetProperty(string.Empty);

            if (fileName == null || fileName.Equals(string.Empty))
            {
                System.Console.Out.WriteLine(usage);
                System.Environment.Exit(-1);
            }
            int      maxLen                = PropertiesUtils.GetInt(options, "y", int.MaxValue);
            bool     printTrees            = PropertiesUtils.GetBool(options, "p", false);
            bool     flattenTrees          = PropertiesUtils.GetBool(options, "f", false);
            bool     printPOS              = PropertiesUtils.GetBool(options, "a", false);
            bool     printTnT              = PropertiesUtils.GetBool(options, "t", false);
            Language language              = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            string encoding                = options.GetProperty("e", "UTF-8");

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();

            tb.LoadPath(fileName);
            // Read the treebank
            PrintWriter pw       = tlpp.Pw();
            int         numTrees = 0;

            try
            {
                foreach (Tree tree in tb)
                {
                    // Trees whose yield exceeds the length limit are neither counted nor printed.
                    if (tree.Yield().Count > maxLen)
                    {
                        continue;
                    }
                    ++numTrees;
                    if (printTrees)
                    {
                        pw.Println(tree.ToString());
                    }
                    else if (flattenTrees)
                    {
                        pw.Println(SentenceUtils.ListToString(tree.Yield()));
                    }
                    else if (printPOS)
                    {
                        pw.Println(SentenceUtils.ListToString(tree.PreTerminalYield()));
                    }
                    else if (printTnT)
                    {
                        // One "word<TAB>tag" line per token, then a blank line after the sentence.
                        IList <CoreLabel> taggedWords = tree.TaggedLabeledYield();
                        foreach (CoreLabel label in taggedWords)
                        {
                            pw.Printf("%s\t%s%n", label.Word(), label.Tag());
                        }
                        pw.Println();
                    }
                }
            }
            finally
            {
                // Release the writer even if reading the treebank fails part-way through.
                pw.Close();
            }
            System.Console.Error.Printf("Read %d trees.%n", numTrees);
        }