示例#1
0
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpParams = new ChineseTreebankParserParams();
            ITreebankLanguagePack     ctlp      = tlpParams.TreebankLanguagePack();
            Options       op = new Options(tlpParams);
            TreeAnnotator ta = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

            log.Info("Reading Trees...");
            IFileFilter trainFilter   = new NumberRangesFileFilter(args[1], true);
            Treebank    trainTreebank = tlpParams.MemoryTreebank();

            trainTreebank.LoadPath(args[0], trainFilter);
            log.Info("Annotating trees...");
            ICollection <Tree> trainTrees = new List <Tree>();

            foreach (Tree tree in trainTreebank)
            {
                trainTrees.Add(ta.TransformTree(tree));
            }
            trainTreebank = null;
            // saves memory
            log.Info("Training lexicon...");
            IIndex <string> wordIndex    = new HashIndex <string>();
            IIndex <string> tagIndex     = new HashIndex <string>();
            int             featureLevel = DefaultFeatureLevel;

            if (args.Length > 3)
            {
                featureLevel = System.Convert.ToInt32(args[3]);
            }
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseMaxentLexicon(op, wordIndex, tagIndex, featureLevel);
            lex.InitializeTraining(trainTrees.Count);
            lex.Train(trainTrees);
            lex.FinishTraining();
            log.Info("Testing");
            IFileFilter testFilter   = new NumberRangesFileFilter(args[2], true);
            Treebank    testTreebank = tlpParams.MemoryTreebank();

            testTreebank.LoadPath(args[0], testFilter);
            IList <TaggedWord> testWords = new List <TaggedWord>();

            foreach (Tree t in testTreebank)
            {
                foreach (TaggedWord tw in t.TaggedYield())
                {
                    testWords.Add(tw);
                }
            }
            //testWords.addAll(t.taggedYield());
            int[] totalAndCorrect = lex.TestOnTreebank(testWords);
            log.Info("done.");
            System.Console.Out.WriteLine(totalAndCorrect[1] + " correct out of " + totalAndCorrect[0] + " -- ACC: " + ((double)totalAndCorrect[1]) / totalAndCorrect[0]);
        }
示例#2
0
        public static Triple <string, IFileFilter, double> GetWeightedTreebankDescription(string[] args, int argIndex, string flag)
        {
            string      path   = null;
            IFileFilter filter = null;
            double      weight = 1.0;
            // the next arguments are the treebank path and maybe the range for testing
            int numSubArgs = NumSubArgs(args, argIndex);

            if (numSubArgs > 0 && numSubArgs < 4)
            {
                argIndex++;
                path = args[argIndex++];
                bool hasWeight = false;
                if (numSubArgs > 1 && DoublePattern.Matcher(args[argIndex + numSubArgs - 2]).Matches())
                {
                    weight    = double.Parse(args[argIndex + numSubArgs - 2]);
                    hasWeight = true;
                    numSubArgs--;
                }
                if (numSubArgs == 2)
                {
                    filter = new NumberRangesFileFilter(args[argIndex++], true);
                }
                else
                {
                    if (numSubArgs == 3)
                    {
                        try
                        {
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            filter    = new NumberRangeFileFilter(low, high, true);
                            argIndex += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            filter = new NumberRangesFileFilter(args[argIndex++], true);
                        }
                    }
                }
                if (hasWeight)
                {
                    argIndex++;
                }
            }
            else
            {
                throw new ArgumentException("Bad arguments after " + flag);
            }
            return(Triple.MakeTriple(path, filter, weight));
        }
 private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator, ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf, NumberRangesFileFilter treeRange, IPredicate
                          <Tree> treeFilter, int wordColumn, int tagColumn)
 {
     // represents a tokenized file separated by text
     // represents a tsv file such as a conll file
     // represents a file in PTB format
     this.file            = file;
     this.format          = format;
     this.encoding        = encoding;
     this.tagSeparator    = tagSeparator;
     this.treeTransformer = treeTransformer;
     this.treeNormalizer  = treeNormalizer;
     this.treeRange       = treeRange;
     this.treeFilter      = treeFilter;
     this.wordColumn      = wordColumn;
     this.tagColumn       = tagColumn;
     this.trf             = trf;
 }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-parser"]        = int.Parse(3);
            flagsToNumArgs["-lex"]           = int.Parse(3);
            flagsToNumArgs["-test"]          = int.Parse(2);
            flagsToNumArgs["-out"]           = int.Parse(1);
            flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
            flagsToNumArgs["-penaltyType"]   = int.Parse(1);
            flagsToNumArgs["-maxLength"]     = int.Parse(1);
            flagsToNumArgs["-stats"]         = int.Parse(2);
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
            bool        eval = argMap.Contains("-eval");
            PrintWriter pw   = null;

            if (argMap.Contains("-out"))
            {
                pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
            }
            log.Info("ChineseCharacterBasedLexicon called with args:");
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            for (int i = 0; i < args.Length; i++)
            {
                ctpp.SetOptionFlag(args, i);
                log.Info(" " + args[i]);
            }
            log.Info();
            Options op = new Options(ctpp);

            if (argMap.Contains("-stats"))
            {
                string[]       statArgs         = (argMap["-stats"]);
                MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    trainFilt        = new NumberRangesFileFilter(statArgs[1], false);
                rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
                log.Info("Done reading trees.");
                MemoryTreebank trainTreebank;
                if (argMap.Contains("-annotate"))
                {
                    trainTreebank = new MemoryTreebank();
                    TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                    foreach (Tree tree in rawTrainTreebank)
                    {
                        trainTreebank.Add(annotator.TransformTree(tree));
                    }
                    log.Info("Done annotating trees.");
                }
                else
                {
                    trainTreebank = rawTrainTreebank;
                }
                PrintStats(trainTreebank, pw);
                System.Environment.Exit(0);
            }
            int maxLength = 1000000;

            //    Test.verbose = true;
            if (argMap.Contains("-norm"))
            {
                op.testOptions.lengthNormalization = true;
            }
            if (argMap.Contains("-maxLength"))
            {
                maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
            }
            op.testOptions.maxLength = 120;
            bool combo = argMap.Contains("-combo");

            if (combo)
            {
                ctpp.useCharacterBasedLexicon = true;
                op.testOptions.maxSpanForTags = 10;
                op.doDep  = false;
                op.dcTags = false;
            }
            LexicalizedParser lp  = null;
            ILexicon          lex = null;

            if (argMap.Contains("-parser"))
            {
                string[] parserArgs = (argMap["-parser"]);
                if (parserArgs.Length > 1)
                {
                    IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
                    lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
                    if (parserArgs.Length == 3)
                    {
                        string filename = parserArgs[2];
                        log.Info("Writing parser in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lp);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string parserFile = parserArgs[0];
                    lp = LexicalizedParser.LoadModel(parserFile, op);
                }
                lex  = lp.GetLexicon();
                op   = lp.GetOp();
                ctpp = (ChineseTreebankParserParams)op.tlpParams;
            }
            if (argMap.Contains("-rad"))
            {
                ctpp.useUnknownCharacterModel = true;
            }
            if (argMap.Contains("-lengthPenalty"))
            {
                ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
            }
            if (argMap.Contains("-penaltyType"))
            {
                ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
            }
            if (argMap.Contains("-lex"))
            {
                string[] lexArgs = (argMap["-lex"]);
                if (lexArgs.Length > 1)
                {
                    IIndex <string> wordIndex = new HashIndex <string>();
                    IIndex <string> tagIndex  = new HashIndex <string>();
                    lex = ctpp.Lex(op, wordIndex, tagIndex);
                    MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                    IFileFilter    trainFilt        = new NumberRangesFileFilter(lexArgs[1], false);
                    rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
                    log.Info("Done reading trees.");
                    MemoryTreebank trainTreebank;
                    if (argMap.Contains("-annotate"))
                    {
                        trainTreebank = new MemoryTreebank();
                        TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                        foreach (Tree tree in rawTrainTreebank)
                        {
                            tree = annotator.TransformTree(tree);
                            trainTreebank.Add(tree);
                        }
                        log.Info("Done annotating trees.");
                    }
                    else
                    {
                        trainTreebank = rawTrainTreebank;
                    }
                    lex.InitializeTraining(trainTreebank.Count);
                    lex.Train(trainTreebank);
                    lex.FinishTraining();
                    log.Info("Done training lexicon.");
                    if (lexArgs.Length == 3)
                    {
                        string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                        log.Info("Writing lexicon in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lex);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
                    log.Info("Reading Lexicon from file " + lexFile);
                    ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
                    try
                    {
                        lex = (ILexicon)@in.ReadObject();
                    }
                    catch (TypeLoadException)
                    {
                        throw new Exception("Bad serialized file: " + lexFile);
                    }
                    @in.Close();
                }
            }
            if (argMap.Contains("-test"))
            {
                bool segmentWords = ctpp.segment;
                bool parse        = lp != null;
                System.Diagnostics.Debug.Assert((parse || segmentWords));
                //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
                //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
                IWordSegmenter seg = null;
                if (segmentWords)
                {
                    seg = (IWordSegmenter)lex;
                }
                string[]       testArgs     = (argMap["-test"]);
                MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    testFilt     = new NumberRangesFileFilter(testArgs[1], false);
                testTreebank.LoadPath(new File(testArgs[0]), testFilt);
                ITreeTransformer          subcategoryStripper = op.tlpParams.SubcategoryStripper();
                ITreeTransformer          collinizer          = ctpp.Collinizer();
                WordCatEquivalenceClasser eqclass             = new WordCatEquivalenceClasser();
                WordCatEqualityChecker    eqcheck             = new WordCatEqualityChecker();
                EquivalenceClassEval      basicEval           = new EquivalenceClassEval(eqclass, eqcheck, "basic");
                EquivalenceClassEval      collinsEval         = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
                IList <string>            evalTypes           = new List <string>(3);
                bool goodPOS = false;
                if (segmentWords)
                {
                    evalTypes.Add(WordCatConstituent.wordType);
                    if (ctpp.segmentMarkov && !parse)
                    {
                        evalTypes.Add(WordCatConstituent.tagType);
                        goodPOS = true;
                    }
                }
                if (parse)
                {
                    evalTypes.Add(WordCatConstituent.tagType);
                    evalTypes.Add(WordCatConstituent.catType);
                    if (combo)
                    {
                        evalTypes.Add(WordCatConstituent.wordType);
                        goodPOS = true;
                    }
                }
                TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
                log.Info("Testing...");
                foreach (Tree goldTop in testTreebank)
                {
                    Tree             gold         = goldTop.FirstChild();
                    IList <IHasWord> goldSentence = gold.YieldHasWord();
                    if (goldSentence.Count > maxLength)
                    {
                        log.Info("Skipping sentence; too long: " + goldSentence.Count);
                        continue;
                    }
                    else
                    {
                        log.Info("Processing sentence; length: " + goldSentence.Count);
                    }
                    IList <IHasWord> s;
                    if (segmentWords)
                    {
                        StringBuilder goldCharBuf = new StringBuilder();
                        foreach (IHasWord aGoldSentence in goldSentence)
                        {
                            StringLabel word = (StringLabel)aGoldSentence;
                            goldCharBuf.Append(word.Value());
                        }
                        string goldChars = goldCharBuf.ToString();
                        s = seg.Segment(goldChars);
                    }
                    else
                    {
                        s = goldSentence;
                    }
                    Tree tree;
                    if (parse)
                    {
                        tree = lp.ParseTree(s);
                        if (tree == null)
                        {
                            throw new Exception("PARSER RETURNED NULL!!!");
                        }
                    }
                    else
                    {
                        tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                        tree = subcategoryStripper.TransformTree(tree);
                    }
                    if (pw != null)
                    {
                        if (parse)
                        {
                            tree.PennPrint(pw);
                        }
                        else
                        {
                            IEnumerator sentIter = s.GetEnumerator();
                            for (; ;)
                            {
                                Word word = (Word)sentIter.Current;
                                pw.Print(word.Word());
                                if (sentIter.MoveNext())
                                {
                                    pw.Print(" ");
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        pw.Println();
                    }
                    if (eval)
                    {
                        ICollection ourBrackets;
                        ICollection goldBrackets;
                        ourBrackets  = proc.AllBrackets(tree);
                        goldBrackets = proc.AllBrackets(gold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                        }
                        basicEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nScores:");
                        basicEval.DisplayLast();
                        Tree collinsTree = collinizer.TransformTree(tree);
                        Tree collinsGold = collinizer.TransformTree(gold);
                        ourBrackets  = proc.AllBrackets(collinsTree);
                        goldBrackets = proc.AllBrackets(collinsGold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                        }
                        collinsEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nCollinized scores:");
                        collinsEval.DisplayLast();
                        System.Console.Out.WriteLine();
                    }
                }
                if (eval)
                {
                    basicEval.Display();
                    System.Console.Out.WriteLine();
                    collinsEval.Display();
                }
            }
        }
示例#5
0
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// <p>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </remarks>
        public static void Main(string[] args)
        {
            bool     train = false;
            bool     saveToSerializedFile      = false;
            bool     saveToTextFile            = false;
            string   serializedInputFileOrUrl  = null;
            string   textInputFileOrUrl        = null;
            string   serializedOutputFileOrUrl = null;
            string   textOutputFileOrUrl       = null;
            string   treebankPath = null;
            Treebank testTreebank = null;
            // Treebank tuneTreebank = null;
            string      testPath    = null;
            IFileFilter testFilter  = null;
            IFileFilter trainFilter = null;
            string      encoding    = null;
            // variables needed to process the files to be parsed
            ITokenizerFactory <Word> tokenizerFactory = null;
            //    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
            bool tokenized = false;
            // whether or not the input file has already been tokenized
            IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
            // int tagDelimiter = -1;
            // String sentenceDelimiter = "\n";
            // boolean fromXML = false;
            int argIndex = 0;

            if (args.Length < 1)
            {
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
                return;
            }
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments
            while (argIndex < args.Length && args[argIndex][0] == '-')
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                {
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs > 1)
                    {
                        treebankPath = args[argIndex];
                        argIndex++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else
                    {
                        if (numSubArgs >= 3)
                        {
                            try
                            {
                                int low  = System.Convert.ToInt32(args[argIndex]);
                                int high = System.Convert.ToInt32(args[argIndex + 1]);
                                trainFilter = new NumberRangeFileFilter(low, high, true);
                                argIndex   += 2;
                            }
                            catch (NumberFormatException)
                            {
                                // maybe it's a ranges expression?
                                trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                                argIndex++;
                            }
                        }
                    }
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
                    {
                        // sets encoding for TreebankLangParserParams
                        encoding = args[argIndex + 1];
                        op.tlpParams.SetInputEncoding(encoding);
                        op.tlpParams.SetOutputEncoding(encoding);
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                        {
                            // load the parser from a binary serialized file
                            // the next argument must be the path to the parser file
                            serializedInputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            // doesn't make sense to load from TextFile -pichuan
                            //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                            //        // load the parser from declarative text file
                            //        // the next argument must be the path to the parser file
                            //        textInputFileOrUrl = args[argIndex + 1];
                            //        argIndex += 2;
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                            {
                                saveToSerializedFile      = true;
                                serializedOutputFileOrUrl = args[argIndex + 1];
                                argIndex += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                                {
                                    // save the parser to declarative text file
                                    saveToTextFile      = true;
                                    textOutputFileOrUrl = args[argIndex + 1];
                                    argIndex           += 2;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                                    {
                                        // the next argument is the treebank path and range for testing
                                        int numSubArgs = NumSubArgs(args, argIndex);
                                        argIndex++;
                                        if (numSubArgs == 1)
                                        {
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        }
                                        else
                                        {
                                            if (numSubArgs > 1)
                                            {
                                                testPath = args[argIndex++];
                                                if (numSubArgs == 2)
                                                {
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                }
                                                else
                                                {
                                                    if (numSubArgs >= 3)
                                                    {
                                                        try
                                                        {
                                                            int low  = System.Convert.ToInt32(args[argIndex]);
                                                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                            testFilter = new NumberRangeFileFilter(low, high, true);
                                                            argIndex  += 2;
                                                        }
                                                        catch (NumberFormatException)
                                                        {
                                                            // maybe it's a ranges expression?
                                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                        if (j == argIndex)
                                        {
                                            log.Info("Unknown option ignored: " + args[argIndex]);
                                            j++;
                                        }
                                        argIndex = j;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // end while loop through arguments
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
            {
                System.Console.Out.WriteLine("Currently " + new DateTime());
                PrintArgs(args, System.Console.Out);
            }
            if (train)
            {
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                {
                    // the next arg must be the treebank path, since it wasn't give earlier
                    treebankPath = args[argIndex];
                    argIndex++;
                    if (args.Length > argIndex + 1)
                    {
                        try
                        {
                            // the next two args might be the range
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                            argIndex++;
                        }
                    }
                }
                Treebank        trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex <string> wordIndex     = new HashIndex <string>();
                IIndex <string> tagIndex      = new HashIndex <string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
            }
            else
            {
                if (textInputFileOrUrl != null)
                {
                }
                else
                {
                    // so we load the segmenter from a text grammar file
                    // XXXXX fix later -pichuan
                    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
                    // so we load a serialized segmenter
                    if (serializedInputFileOrUrl == null)
                    {
                        // the next argument must be the path to the serialized parser
                        serializedInputFileOrUrl = args[argIndex];
                        argIndex++;
                    }
                    try
                    {
                        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                    }
                    catch (ArgumentException)
                    {
                        log.Info("Error loading segmenter, exiting...");
                        System.Environment.Exit(0);
                    }
                }
            }
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
            {
                if (testPath == null)
                {
                    if (treebankPath == null)
                    {
                        throw new Exception("No test treebank path specified...");
                    }
                    else
                    {
                        log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                        testPath = treebankPath;
                    }
                }
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            }
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            //
            // -- Roger
            if (op.testOptions.verbose)
            {
                log.Info("Lexicon is " + cs.GetType().FullName);
            }
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
            {
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                }
                else
                {
                    log.Info("Usage: must specify a text segmenter data output path");
                }
            }
            if (saveToSerializedFile)
            {
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                {
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                if (serializedOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                }
                else
                {
                    if (textOutputFileOrUrl == null && testTreebank == null)
                    {
                        // no saving/parsing request has been specified
                        log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
                    }
                }
            }
            /* --------------------- Testing part!!!! ----------------------- */
            if (op.testOptions.verbose)
            {
            }
            //      printOptions(false, op);
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
            {
                // test parser on treebank
                if (testTreebank == null)
                {
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                    {
                        testTreebank.LoadPath(args[argIndex + 1]);
                    }
                    else
                    {
                        int testlow  = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
                    }
                }
            }
        }
        public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
        {
            string[] pieces = description.Split(",");
            if (pieces.Length == 1)
            {
                return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null));
            }
            string[] args = new string[pieces.Length - 1];
            System.Array.Copy(pieces, 0, args, 0, pieces.Length - 1);
            string file = pieces[pieces.Length - 1];

            TaggedFileRecord.Format format         = TaggedFileRecord.Format.Text;
            string                 encoding        = GetEncoding(config);
            string                 tagSeparator    = GetTagSeparator(config);
            ITreeTransformer       treeTransformer = null;
            TreeNormalizer         treeNormalizer  = null;
            ITreeReaderFactory     trf             = null;
            NumberRangesFileFilter treeRange       = null;
            IPredicate <Tree>      treeFilter      = null;
            int wordColumn = null;
            int tagColumn  = null;

            foreach (string arg in args)
            {
                string[] argPieces = arg.Split("=", 2);
                if (argPieces.Length != 2)
                {
                    throw new ArgumentException("TaggedFileRecord argument " + arg + " has an unexpected number of =s");
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Format))
                {
                    format = TaggedFileRecord.Format.ValueOf(argPieces[1]);
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], Encoding))
                    {
                        encoding = argPieces[1];
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagSeparator))
                        {
                            tagSeparator = argPieces[1];
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeTransformer))
                            {
                                treeTransformer = ReflectionLoading.LoadByReflection(argPieces[1]);
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeNormalizer))
                                {
                                    treeNormalizer = ReflectionLoading.LoadByReflection(argPieces[1]);
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeReader))
                                    {
                                        trf = ReflectionLoading.LoadByReflection(argPieces[1]);
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeRange))
                                        {
                                            string range = argPieces[1].ReplaceAll(":", ",");
                                            treeRange = new NumberRangesFileFilter(range, true);
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TreeFilter))
                                            {
                                                treeFilter = ReflectionLoading.LoadByReflection(argPieces[1]);
                                            }
                                            else
                                            {
                                                if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], WordColumn))
                                                {
                                                    wordColumn = int.Parse(argPieces[1]);
                                                }
                                                else
                                                {
                                                    if (Sharpen.Runtime.EqualsIgnoreCase(argPieces[0], TagColumn))
                                                    {
                                                        tagColumn = int.Parse(argPieces[1]);
                                                    }
                                                    else
                                                    {
                                                        throw new ArgumentException("TaggedFileRecord argument " + argPieces[0] + " is unknown");
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return(new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn));
        }
示例#7
0
        /// <summary>Lets you test out the TreeAnnotatorAndBinarizer on the command line.</summary>
        /// <param name="args">
        /// Command line arguments: All flags accepted by FactoredParser.setOptionFlag
        /// and -train treebankPath [fileRanges]
        /// </param>
        public static void Main(string[] args)
        {
            Options     op           = new Options();
            string      treebankPath = null;
            IFileFilter trainFilter  = null;
            int         i            = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train"))
                {
                    int numSubArgs = NumSubArgs(args, i);
                    i++;
                    if (numSubArgs >= 1)
                    {
                        treebankPath = args[i];
                        i++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[i++], true);
                    }
                    else
                    {
                        if (numSubArgs >= 3)
                        {
                            int low  = System.Convert.ToInt32(args[i]);
                            int high = System.Convert.ToInt32(args[i + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            i          += 2;
                        }
                    }
                }
                else
                {
                    i = op.SetOption(args, i);
                }
            }
            if (i < args.Length)
            {
                log.Info("usage: java TreeAnnotatorAndBinarizer options*");
                log.Info("  Options are like for lexicalized parser including -train treebankPath fileRange]");
                return;
            }
            log.Info("Annotating from treebank dir: " + treebankPath);
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            if (trainFilter == null)
            {
                trainTreebank.LoadPath(treebankPath);
            }
            else
            {
                trainTreebank.LoadPath(treebankPath, trainFilter);
            }
            Treebank           binaryTrainTreebank = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank, null, null, op).First();
            IEnumerator <Tree> it = trainTreebank.GetEnumerator();

            foreach (Tree t in binaryTrainTreebank)
            {
                System.Console.Out.WriteLine("Original tree:");
                it.Current.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                t.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>for testing -- CURRENTLY BROKEN!!!</summary>
        /// <param name="args">input dir and output filename</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                throw new Exception("args: treebankPath trainNums testNums");
            }
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            ctpp.charTags = true;
            // TODO: these options are getting clobbered by reading in the
            // parser object (unless it's a text file parser?)
            Options op = new Options(ctpp);

            op.doDep = false;
            op.testOptions.maxLength = 90;
            LexicalizedParser lp;

            try
            {
                IFileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
                lp = LexicalizedParser.TrainFromTreebank(args[0], trainFilt, op);
                try
                {
                    string filename = "chineseCharTagPCFG.ser.gz";
                    log.Info("Writing parser in serialized format to file " + filename + " ");
                    System.Console.Error.Flush();
                    ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                    @out.WriteObject(lp);
                    @out.Close();
                    log.Info("done.");
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            catch (ArgumentException)
            {
                lp = LexicalizedParser.LoadModel(args[1], op);
            }
            IFileFilter    testFilt     = new NumberRangesFileFilter(args[2], false);
            MemoryTreebank testTreebank = ctpp.MemoryTreebank();

            testTreebank.LoadPath(new File(args[0]), testFilt);
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
            WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
            WordCatEqualityChecker    eqcheck = new WordCatEqualityChecker();
            EquivalenceClassEval      eval    = new EquivalenceClassEval(eqclass, eqcheck);

            //    System.out.println("Preterminals:" + preterminals);
            System.Console.Out.WriteLine("Testing...");
            foreach (Tree gold in testTreebank)
            {
                Tree tree;
                try
                {
                    tree = lp.ParseTree(gold.YieldHasWord());
                    if (tree == null)
                    {
                        System.Console.Out.WriteLine("Failed to parse " + gold.YieldHasWord());
                        continue;
                    }
                }
                catch (Exception e)
                {
                    Sharpen.Runtime.PrintStackTrace(e);
                    continue;
                }
                gold = gold.FirstChild();
                pw.Println(SentenceUtils.ListToString(gold.PreTerminalYield()));
                pw.Println(SentenceUtils.ListToString(gold.Yield()));
                gold.PennPrint(pw);
                pw.Println(tree.PreTerminalYield());
                pw.Println(tree.Yield());
                tree.PennPrint(pw);
                //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
                //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
                //      eval.eval(allBrackets, goldBrackets);
                eval.DisplayLast();
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine();
            eval.Display();
        }