// Records the number of times word/tag pair was seen in training data.
 // Counts of each tag (stored as a Label) on unknown words.
 // tag (Label) --> signature --> count
 /// <summary>
 /// Resets all counting state for a fresh training run and derives the
 /// unknown-word strategy flags from the lexicon options, then builds the
 /// backing unknown-word model.
 /// </summary>
 /// <param name="op">Parser options; lexOptions and trainOptions are read here.</param>
 /// <param name="lex">Lexicon being trained.</param>
 /// <param name="wordIndex">Shared word index.</param>
 /// <param name="tagIndex">Shared tag index.</param>
 /// <param name="totalTrees">Total number of training trees expected.</param>
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
     seenCounter   = new ClassicCounter <IntTaggedWord>();
     unSeenCounter = new ClassicCounter <IntTaggedWord>();
     tagHash       = Generics.NewHashMap();
     tc            = new ClassicCounter <ILabel>();
     c             = Generics.NewHashMap();
     seenEnd       = Generics.NewHashSet();
     // Suffix classing needs both a positive suffix size and signatures enabled.
     useEnd        = (op.lexOptions.unknownSuffixSize > 0 && op.lexOptions.useUnknownWordSignatures > 0);
     useFirstCap   = op.lexOptions.useUnknownWordSignatures > 0;
     // Good-Turing smoothing is the fallback when signatures are disabled.
     useGT         = (op.lexOptions.useUnknownWordSignatures == 0);
     useFirst      = false;
     // NOTE(review): useFirst is hard-coded false above, so this branch is
     // currently dead — kept for parity with the upstream implementation.
     if (useFirst)
     {
         log.Info("Including first letter for unknown words.");
     }
     if (useFirstCap)
     {
         log.Info("Including whether first letter is capitalized for unknown words");
     }
     if (useEnd)
     {
         log.Info("Classing unknown word as the average of their equivalents by identity of last " + op.lexOptions.unknownSuffixSize + " letters.");
     }
     if (useGT)
     {
         log.Info("Using Good-Turing smoothing for unknown words.");
     }
     // Unknown-word counting only begins after this fraction of the trees.
     this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
     this.unknownGTTrainer        = (useGT) ? new UnknownGTTrainer() : null;
     this.model = BuildUWM();
 }
Esempio n. 2
0
 /// <summary>
 /// Builds a Spanish unknown-word model, copying the mutation and affix
 /// size settings from the lexicon options.
 /// </summary>
 public SpanishUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter)
     : base(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null)
 {
     var lexOpts = op.lexOptions;

     this.smartMutation     = lexOpts.smartMutation;
     this.unknownPrefixSize = lexOpts.unknownPrefixSize;
     this.unknownSuffixSize = lexOpts.unknownSuffixSize;
 }
        /// <summary>
        /// Splits <paramref name="userInput"/> on spaces and resolves each
        /// non-blank chunk to a token via the lexicon.
        /// </summary>
        /// <param name="lexicon">Lexicon used to match each raw chunk to a token.</param>
        /// <param name="userInput">Raw input line; may be empty.</param>
        /// <returns>The matched tokens, in input order.</returns>
        public IEnumerable <IToken> Tokenize(ILexicon lexicon, string userInput)
        {
            List <IToken> list = new List <IToken>();

            string buf = "";

            // BUG FIX: the original do/while evaluated userInput[i] before any
            // length check, throwing IndexOutOfRangeException on empty input,
            // and it passed empty buffers to lexicon.Match on leading or
            // consecutive spaces.
            for (int i = 0; i < userInput.Length; i++)
            {
                if (userInput[i] == ' ')
                {
                    if (!string.IsNullOrWhiteSpace(buf))
                    {
                        list.Add(lexicon.Match(buf));
                    }
                    buf = "";
                }
                else
                {
                    buf += userInput[i];
                }
            }

            // Flush the trailing chunk, if any.
            if (!string.IsNullOrWhiteSpace(buf))
            {
                list.Add(lexicon.Match(buf));
            }

            return(list);
        }
Esempio n. 4
0
 // boundary tag -- assumed not a real tag
 /// <summary>
 /// Prepares fresh counters for a new training pass and wires up the
 /// underlying French unknown-word model.
 /// </summary>
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
     // Unknown-word counting only begins after this fraction of the trees.
     indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;
     seenCounter   = new ClassicCounter <IntTaggedWord>();
     unSeenCounter = new ClassicCounter <IntTaggedWord>();
     model         = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
 }
    /// <summary>Creates the syntax service over the given parser and lexicon.</summary>
    public SyntaxService(IParser parser, ILexicon lexicon)
    {
        m_lexicon = lexicon;
        m_parser  = parser;

        // These are the settings the author needed, YMMV.  They seem to add a
        // lot of UI logic to this constructor :(
        m_processingResources = new ProcessingResources(m_lexicon, null, null, m_parser, 5, null, false, null, null, null, null, false);
    }
Esempio n. 6
0
 /// <summary>
 /// Stores the training context (options, lexicon, indices, tree count)
 /// and resets the number of trees read so far to zero.
 /// </summary>
 public virtual void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     this.op         = op;
     this.lex        = lex;
     this.wordIndex  = wordIndex;
     this.tagIndex   = tagIndex;
     this.totalTrees = totalTrees;
     this.treesRead  = 0;
 }
 /// <summary>
 /// Constructs an exhaustive dependency parser over the given grammar,
 /// lexicon, and shared indices.
 /// </summary>
 public ExhaustiveDependencyParser(IDependencyGrammar dg, ILexicon lex, Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     this.op        = op;
     this.dg        = dg;
     this.lex       = lex;
     this.wordIndex = wordIndex;
     this.tagIndex  = tagIndex;
     // Language pack and tree factory are derived here rather than injected.
     this.tlp = op.Langpack();
     this.tf  = new LabeledScoredTreeFactory();
 }
Esempio n. 8
0
        /// <summary>
        /// Builds a result view directly from an in-memory document, copying
        /// its identifying fields.
        /// </summary>
        public DocumentResult(Document doc)
        {
            docIndex = FactoryDocumentIndex.GetDocumentIndex();
            lexicon  = FactoryLexicon.GetLexicon();

            DocID        = doc.DocID;
            File         = doc.File;
            Title        = doc.Title;
            WordQuantity = doc.WordQuantity;
        }
Esempio n. 9
0
        /// <summary>
        /// Creates a new lexicon row and bulk-inserts its words inside a single
        /// transaction.  Returns false (after logging) on any failure; never throws.
        /// </summary>
        /// <param name="lexicon">Lexicon to create; must not be null.</param>
        /// <param name="words">Words belonging to the lexicon.</param>
        /// <returns>True when the lexicon and all of its words were inserted.</returns>
        public async Task <bool> AddDictionary(ILexicon lexicon, IEnumerable <IWord> words)
        {
            try
            {
                if (lexicon == null)
                {
                    throw new ArgumentNullException(nameof(lexicon));
                }

                // Materialize once: 'words' is enumerated for the insert and
                // again for the count below.
                var wordList = words as ICollection <IWord> ?? words.ToList();

                using var cnn = _createdDbConnection();
                cnn.Open();
                // BUG FIX: dispose the transaction so it is rolled back when we
                // throw before Commit (the original leaked an open transaction).
                using var transaction = cnn.BeginTransaction();

                // Check if Lexicon exists
                string checkQuery = "SELECT Language FROM Lexicon WHERE Language=@Language";
                var    existing   = await cnn.ExecuteScalarAsync(checkQuery, new { lexicon.Language }, transaction);

                if (existing != null)
                {
                    throw new InvalidConstraintException($"Dictionary [{lexicon.Language}] already exists");
                }

                // Create the lexicon row.
                string query = $"INSERT INTO Lexicon {_getSqlInsertFields(typeof(Lexicon))}";

                // BUG FIX: commands must be enlisted in the pending transaction;
                // Dapper throws when a local transaction is open but not passed
                // to the command.
                var res = await cnn.ExecuteAsync(query, lexicon, transaction);

                if (res == 0)
                {
                    throw new Exception($"ExecuteAsync failed: {query} [{lexicon.ToJson()}]");
                }

                // Create the words, stripping the auto-generated Id column.
                string wordQuery = $"INSERT INTO Word {_getSqlInsertFields(typeof(Word))}"
                                   .Replace("@Id,", "")
                                   .Replace("Id,", "");

                var resWords = await cnn.ExecuteAsync(wordQuery, wordList, transaction);

                if (resWords != wordList.Count)
                {
                    throw new Exception($"ExecuteAsync failed: {wordQuery}");
                }

                transaction.Commit();

                return(true);
            }
            catch (Exception ex)
            {
                // BUG FIX: the original wrote "await _log?.WriteErrorAsync(...)",
                // which awaits a null Task (NullReferenceException) when _log is null.
                if (_log != null)
                {
                    await _log.WriteErrorAsync(nameof(WordRepository), nameof(AddDictionary), lexicon?.ToJson(), null, ex);
                }

                return(false);
            }
        }
 /// <summary>
 /// Builds an Arabic unknown-word model after validating that the
 /// configured unknown-word signature level is within range.
 /// </summary>
 /// <exception cref="ArgumentException">When useUnknownWordSignatures is out of range.</exception>
 public ArabicUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter)
     : base(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null)
 {
     bool levelInRange = unknownLevel >= MinUnknown && unknownLevel <= MaxUnknown;

     if (!levelInRange)
     {
         throw new ArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
     }
     var lexOpts = op.lexOptions;

     this.smartMutation     = lexOpts.smartMutation;
     this.unknownPrefixSize = lexOpts.unknownPrefixSize;
     this.unknownSuffixSize = lexOpts.unknownSuffixSize;
 }
Esempio n. 11
0
        /// <summary>
        /// Builds a result view by looking the document up in the index by id
        /// and copying its identifying fields.
        /// </summary>
        /// <param name="docID">Identifier of the document to look up.</param>
        public DocumentResult(int docID)
        {
            Document docTmp;

            this.docIndex = FactoryDocumentIndex.GetDocumentIndex();
            this.lexicon = FactoryLexicon.GetLexicon();

            // NOTE(review): assumes Search always finds the id — TODO confirm
            // it cannot return null.
            docTmp = docIndex.Search(docID);

            // BUG FIX: the original read "this.DocID = DocID;", a self-assignment
            // that left DocID at its default instead of the found document's id.
            this.DocID = docTmp.DocID;
            this.File = docTmp.File;
            this.Title = docTmp.Title;
            this.WordQuantity = docTmp.WordQuantity;
        }
        // Records the number of times word/tag pair was seen in training data.
        // c has a map from tags as Label to a Counter from word
        // signatures to Strings; it is used to collect counts that will
        // initialize the probabilities in tagHash
        // tc record the marginal counts for each tag as an unknown.  It
        // should be the same as c's totalCount ??
        /// <summary>
        /// Resets all counting state for Chinese unknown-word training and
        /// decides between first-character classing and Good-Turing smoothing,
        /// then constructs the backing ChineseUnknownWordModel.
        /// </summary>
        public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
        {
            base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
            bool useGoodTuringUnknownWordModel = ChineseTreebankParserParams.DefaultUseGoodTurningUnknownWordModel;

            useFirst = true;
            useGT    = (op.lexOptions.useUnknownWordSignatures == 0);
            // Prefer the lexicon's own setting; otherwise fall back to the
            // treebank parser params, and finally to the default above.
            if (lex is ChineseLexicon)
            {
                useGoodTuringUnknownWordModel = ((ChineseLexicon)lex).useGoodTuringUnknownWordModel;
            }
            else
            {
                if (op.tlpParams is ChineseTreebankParserParams)
                {
                    useGoodTuringUnknownWordModel = ((ChineseTreebankParserParams)op.tlpParams).useGoodTuringUnknownWordModel;
                }
            }
            // Good-Turing and first-character classing are mutually exclusive.
            if (useGoodTuringUnknownWordModel)
            {
                this.useGT    = true;
                this.useFirst = false;
            }
            this.useUnicodeType = op.lexOptions.useUnicodeType;
            if (useFirst)
            {
                log.Info("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + useUnicodeType);
            }
            if (useGT)
            {
                log.Info("ChineseUWM: using Good-Turing smoothing for unknown words.");
            }
            // Fresh counting structures for this training run.
            this.c                       = Generics.NewHashMap();
            this.tc                      = new ClassicCounter <ILabel>();
            this.unSeenCounter           = new ClassicCounter <IntTaggedWord>();
            this.seenCounter             = new ClassicCounter <IntTaggedWord>();
            this.seenFirst               = Generics.NewHashSet();
            this.tagHash                 = Generics.NewHashMap();
            // Unknown-word counting only begins after this fraction of the trees.
            this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
            this.unknownGTTrainer        = (useGT) ? new UnknownGTTrainer() : null;
            IDictionary <string, float> unknownGT = null;

            if (useGT)
            {
                unknownGT = unknownGTTrainer.unknownGT;
            }
            this.model = new ChineseUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, useGT, seenFirst);
        }
Esempio n. 13
0
        /// <summary>
        /// Constructs the Chinese unknown-word model.  First-character classing
        /// is used exactly when Good-Turing smoothing is not.
        /// </summary>
        public ChineseUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter, IDictionary <ILabel, ClassicCounter <string> > tagHash, IDictionary <string, float> unknownGT
                                       , bool useGT, ICollection <string> seenFirst)
            : base(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, null)
        {
            this.useGT          = useGT;
            // First-character classing is the complement of Good-Turing here.
            this.useFirst       = !useGT;
            this.useUnicodeType = op.lexOptions.useUnicodeType;
            this.seenFirst      = seenFirst;
        }
Esempio n. 14
0
 /// <summary>
 /// Creates a tagging evaluator; the per-category counters are only
 /// allocated when category-level evaluation is enabled.
 /// </summary>
 public TaggingEval(string str, bool runningAverages, ILexicon lex)
     : base(str, runningAverages)
 {
     this.lex = lex;
     if (!doCatLevelEval)
     {
         return;
     }
     precisions  = new ClassicCounter <string>();
     recalls     = new ClassicCounter <string>();
     f1s         = new ClassicCounter <string>();
     precisions2 = new ClassicCounter <string>();
     recalls2    = new ClassicCounter <string>();
     pnums2      = new ClassicCounter <string>();
     rnums2      = new ClassicCounter <string>();
     percentOOV  = new ClassicCounter <string>();
     percentOOV2 = new ClassicCounter <string>();
 }
 /// <summary>
 /// Wires together the component scorers, grammars, lexicon, and indices
 /// for bilexical PCFG parsing, and preallocates scratch edge/hook objects.
 /// </summary>
 internal BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection projection, IIndex <string> stateIndex
                          , IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     this.scorer     = scorer;
     this.fscorer    = fscorer;
     this.dparser    = dparser;
     this.projection = projection;
     this.bg         = bg;
     this.ug         = ug;
     this.dg         = dg;
     this.lex        = lex;
     this.op         = op;
     this.stateIndex = stateIndex;
     this.wordIndex  = wordIndex;
     this.tagIndex   = tagIndex;
     // Reusable scratch objects, sized by the exhaustive-test setting.
     this.tempEdge = new Edge(op.testOptions.exhaustiveTest);
     this.tempHook = new Hook(op.testOptions.exhaustiveTest);
 }
Esempio n. 16
0
        /// <summary>
        /// Trains a full LexicalizedParser from a treebank: binarizes the trees,
        /// extracts the PCFG (unary + binary grammars), trains the lexicon, and
        /// optionally extracts a dependency grammar.
        /// </summary>
        /// <param name="trainTreebank">Treebank of gold training trees.</param>
        /// <returns>The trained parser bundling grammars, lexicon, and indices.</returns>
        public virtual LexicalizedParser GetParserDataFromTreebank(Treebank trainTreebank)
        {
            log.Info("Binarizing training trees...");
            IList <Tree> binaryTrainTrees = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank);

            Timing.Tick("done.");
            IIndex <string> stateIndex = new HashIndex <string>();

            log.Info("Extracting PCFG...");
            IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
            Pair <UnaryGrammar, BinaryGrammar> bgug = bgExtractor.Extract(binaryTrainTrees);
            BinaryGrammar bg = bgug.second;

            bg.SplitRules();
            UnaryGrammar ug = bgug.first;

            ug.PurgeRules();
            Timing.Tick("done.");
            log.Info("Extracting Lexicon...");
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();
            ILexicon        lex       = op.tlpParams.Lex(op, wordIndex, tagIndex);

            // Standard three-phase lexicon training protocol.
            lex.InitializeTraining(binaryTrainTrees.Count);
            lex.Train(binaryTrainTrees);
            lex.FinishTraining();
            Timing.Tick("done.");
            IExtractor <IDependencyGrammar> dgExtractor = op.tlpParams.DependencyGrammarExtractor(op, wordIndex, tagIndex);
            IDependencyGrammar dg = null;

            // Dependency grammar is only built when dependency parsing is enabled.
            if (op.doDep)
            {
                log.Info("Extracting Dependencies...");
                dg = dgExtractor.Extract(binaryTrainTrees);
                dg.SetLexicon(lex);
                Timing.Tick("done.");
            }
            log.Info("Done extracting grammars and lexicon.");
            return(new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op));
        }
Esempio n. 17
0
 /// <summary>
 /// Base constructor for unknown-word models: derives the strategy flags
 /// from the lexicon options and stores the shared counting structures.
 /// </summary>
 public BaseUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter, IDictionary <ILabel, ClassicCounter <string> > tagHash, IDictionary <string, float> unknownGT,
                             ICollection <string> seenEnd)
 {
     var lexOpts = op.lexOptions;

     // endLength is only consulted when useEnd is true.
     endLength = lexOpts.unknownSuffixSize;
     // TODO: refactor these terms into BaseUnknownWordModelTrainer
     useEnd       = lexOpts.unknownSuffixSize > 0 && lexOpts.useUnknownWordSignatures > 0;
     useFirstCap  = lexOpts.useUnknownWordSignatures > 0;
     useGT        = lexOpts.useUnknownWordSignatures == 0;
     useFirst     = false;
     unknownLevel = lexOpts.useUnknownWordSignatures;

     this.lex           = lex;
     this.trainOptions  = op.trainOptions;
     this.wordIndex     = wordIndex;
     this.tagIndex      = tagIndex;
     this.unSeenCounter = unSeenCounter;
     this.tagHash       = tagHash;
     this.seenEnd       = seenEnd;
     this.unknownGT     = unknownGT;
 }
 /// <summary>This constructor creates an UWM with empty data structures.</summary>
 /// <remarks>
 /// This constructor creates an UWM with empty data structures.  Only
 /// use if loading in the data separately, such as by reading in text
 /// lines containing the data.
 /// </remarks>
 public ArabicUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : this(op, lex, wordIndex, tagIndex, new ClassicCounter <IntTaggedWord>())
 {
     // Delegates to the full constructor with a fresh, empty unseen counter.
 }
        /// <summary>
        /// Builds a query object for one parsing run, instantiating the PCFG,
        /// dependency, and factored (combining) parsers according to the
        /// options carried by the trained parser.
        /// </summary>
        internal LexicalizedParserQuery(LexicalizedParser parser)
        {
            this.op = parser.GetOp();
            BinaryGrammar      bg         = parser.bg;
            UnaryGrammar       ug         = parser.ug;
            ILexicon           lex        = parser.lex;
            IDependencyGrammar dg         = parser.dg;
            IIndex <string>    stateIndex = parser.stateIndex;
            // NOTE(review): DeltaIndex wraps the trained word index — presumably
            // so words seen at query time don't mutate the parser's index; confirm.
            IIndex <string>    wordIndex  = new DeltaIndex <string>(parser.wordIndex);
            IIndex <string>    tagIndex   = parser.tagIndex;

            this.debinarizer     = new Debinarizer(op.forceCNF);
            this.boundaryRemover = new BoundaryRemover();
            // PCFG parser: iterative CKY or exhaustive, if enabled at all.
            if (op.doPCFG)
            {
                if (op.testOptions.iterativeCKY)
                {
                    pparser = new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
                else
                {
                    pparser = new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
            }
            else
            {
                pparser = null;
            }
            // Dependency parser: only built for the non-fast-factored path.
            if (op.doDep)
            {
                dg.SetLexicon(lex);
                if (!op.testOptions.useFastFactored)
                {
                    dparser = new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
                }
                else
                {
                    dparser = null;
                }
            }
            else
            {
                dparser = null;
            }
            // Combining parser: requires both PCFG and dependency models.
            if (op.doDep && op.doPCFG)
            {
                if (op.testOptions.useFastFactored)
                {
                    MLEDependencyGrammar mledg = (MLEDependencyGrammar)dg;
                    int numToFind = 1;
                    if (op.testOptions.printFactoredKGood > 0)
                    {
                        numToFind = op.testOptions.printFactoredKGood;
                    }
                    bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
                }
                else
                {
                    IScorer scorer = new TwinScorer(pparser, dparser);
                    //Scorer scorer = parser;
                    if (op.testOptions.useN5)
                    {
                        bparser = new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                    else
                    {
                        bparser = new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                }
            }
            else
            {
                bparser = null;
            }
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
        }
 /// <summary>
 /// Convenience constructor that loads the tagger from the grammar factory
 /// and delegates to the full constructor.
 /// </summary>
 public EvaluateTreebank(Options op, ILexicon lex, ParserGrammar pqFactory)
     : this(op, lex, pqFactory, pqFactory.LoadTagger())
 {
 }
Esempio n. 21
0
        /// <summary>
        /// Removes the lexicon from the text chunk's context and releases the
        /// COM wrappers involved, even when removal fails.
        /// </summary>
        /// <param name="lexicon">Lexicon to remove; its COM reference is always released.</param>
        internal void UnloadDictionary(ILexicon lexicon)
        {
            ITextContext textContext = null;
            try
            {
                _textChunk.get_Context(out textContext);
                textContext.RemoveLexicon(lexicon);
            }
            finally
            {
                // Release the caller's lexicon RCW unconditionally; the context
                // is only released if get_Context actually produced one.
                Marshal.ReleaseComObject(lexicon);

                if (textContext != null)
                {
                    Marshal.ReleaseComObject(textContext);
                }
            }
        }
Esempio n. 22
0
 /// <summary>This constructor creates an UWM with empty data structures.</summary>
 /// <remarks>
 /// This constructor creates an UWM with empty data structures.  Only
 /// use if loading in the data separately, such as by reading in text
 /// lines containing the data.
 /// </remarks>
 public BaseUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : this(op, lex, wordIndex, tagIndex, new ClassicCounter <IntTaggedWord>(), Generics.NewHashMap <ILabel, ClassicCounter <string> >(), Generics.NewHashMap <string, float>(), Generics.NewHashSet <string>())
 {
     // Delegates with freshly-created, empty counting structures.
 }
 /// <summary>
 /// Pass-through constructor; all behavior lives in the BiLexPCFGParser base.
 /// </summary>
 internal N5BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser leach, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection proj, IIndex <string> stateIndex
                            , IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(scorer, fscorer, leach, bg, ug, dg, lex, op, proj, stateIndex, wordIndex, tagIndex)
 {
 }
Esempio n. 24
0
 /// <summary>
 /// Pass-through constructor; all state is handled by the base PCFG parser.
 /// </summary>
 public IterativeCKYPCFGParser(BinaryGrammar bg, UnaryGrammar ug, ILexicon lex, Options op, IIndex <string> stateIndex, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(bg, ug, lex, op, stateIndex, wordIndex, tagIndex)
 {
 }
        /// <summary>
        /// Sets up a treebank evaluation run: binarizer, collinizer, and every
        /// evaluation metric that is switched on in op.testOptions.evals
        /// (a properties table of boolean/string flags).
        /// </summary>
        /// <param name="op">Parser options; testOptions.evals selects metrics.</param>
        /// <param name="lex">Lexicon, passed to the tagging evaluators.</param>
        /// <param name="pqFactory">Grammar/factory providing extra evals and parser queries.</param>
        /// <param name="tagger">Optional tagger applied to input sentences.</param>
        public EvaluateTreebank(Options op, ILexicon lex, ParserGrammar pqFactory, Func <IList <IHasWord>, IList <TaggedWord> > tagger)
        {
            // private final Lexicon lex;
            // no annotation
            this.op                  = op;
            this.debinarizer         = new Debinarizer(op.forceCNF);
            this.subcategoryStripper = op.tlpParams.SubcategoryStripper();
            this.evals               = Generics.NewArrayList();
            Sharpen.Collections.AddAll(evals, pqFactory.GetExtraEvals());
            this.parserQueryEvals = pqFactory.GetParserQueryEvals();
            // this.lex = lex;
            this.pqFactory  = pqFactory;
            this.tagger     = tagger;
            collinizer      = op.tlpParams.Collinizer();
            boundaryRemover = new BoundaryRemover();
            // All metrics below share the runningAverages flag.
            bool runningAverages = bool.Parse(op.testOptions.evals.GetProperty("runningAverages"));

            summary = bool.Parse(op.testOptions.evals.GetProperty("summary"));
            tsv     = bool.Parse(op.testOptions.evals.GetProperty("tsv"));
            // Choose a binarizer consistent with the training direction.
            if (!op.trainOptions.leftToRight)
            {
                binarizerOnly = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, false, false, op);
            }
            else
            {
                binarizerOnly = new TreeAnnotatorAndBinarizer(op.tlpParams.HeadFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, false, false, op);
            }
            // --- PCFG metrics ---
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgLB")))
            {
                pcfgLB = new Evalb("pcfg LP/LR", runningAverages);
            }
            // TODO: might be nice to allow more than one child-specific scorer
            if (op.testOptions.evals.GetProperty("pcfgChildSpecific") != null)
            {
                string filter = op.testOptions.evals.GetProperty("pcfgChildSpecific");
                pcfgChildSpecific = FilteredEval.ChildFilteredEval("pcfg children matching " + filter + " LP/LR", runningAverages, op.Langpack(), filter);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgLA")))
            {
                pcfgLA = new LeafAncestorEval("pcfg LeafAncestor");
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgCB")))
            {
                pcfgCB = new Evalb.CBEval("pcfg CB", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgDA")))
            {
                pcfgDA = new UnlabeledAttachmentEval("pcfg DA", runningAverages, op.Langpack().HeadFinder());
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgTA")))
            {
                pcfgTA = new TaggingEval("pcfg Tag", runningAverages, lex);
            }
            // --- Dependency metrics ---
            if (bool.Parse(op.testOptions.evals.GetProperty("depDA")))
            {
                depDA = new UnlabeledAttachmentEval("dep DA", runningAverages, null, op.Langpack().PunctuationWordRejectFilter());
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("depTA")))
            {
                depTA = new TaggingEval("dep Tag", runningAverages, lex);
            }
            // --- Factored (combined) metrics ---
            if (bool.Parse(op.testOptions.evals.GetProperty("factLB")))
            {
                factLB = new Evalb("factor LP/LR", runningAverages);
            }
            if (op.testOptions.evals.GetProperty("factChildSpecific") != null)
            {
                string filter = op.testOptions.evals.GetProperty("factChildSpecific");
                factChildSpecific = FilteredEval.ChildFilteredEval("fact children matching " + filter + " LP/LR", runningAverages, op.Langpack(), filter);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("factLA")))
            {
                factLA = new LeafAncestorEval("factor LeafAncestor");
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("factCB")))
            {
                factCB = new Evalb.CBEval("fact CB", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("factDA")))
            {
                factDA = new UnlabeledAttachmentEval("factor DA", runningAverages, null);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("factTA")))
            {
                factTA = new TaggingEval("factor Tag", runningAverages, lex);
            }
            // --- Rule/category error and likelihood metrics ---
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgRUO")))
            {
                pcfgRUO = new AbstractEval.RuleErrorEval("pcfg Rule under/over");
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgCUO")))
            {
                pcfgCUO = new AbstractEval.CatErrorEval("pcfg Category under/over");
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgCatE")))
            {
                pcfgCatE = new EvalbByCat("pcfg Category Eval", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgLL")))
            {
                pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("depLL")))
            {
                depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("factLL")))
            {
                factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
            }
            if (bool.Parse(op.testOptions.evals.GetProperty("topMatch")))
            {
                evals.Add(new TopMatchEval("topMatch", runningAverages));
            }
            // this one is for the various k Good/Best options.  Just for individual results
            kGoodLB = new Evalb("kGood LP/LR", false);
            if (bool.Parse(op.testOptions.evals.GetProperty("pcfgTopK")))
            {
                topKEvals.Add(new BestOfTopKEval(new Evalb("pcfg top k comparisons", false), new Evalb("pcfg top k LP/LR", runningAverages)));
            }
            // k-best parsing must be at least as deep as any requested printout.
            if (topKEvals.Count > 0)
            {
                kbestPCFG = op.testOptions.evalPCFGkBest;
            }
            if (op.testOptions.printPCFGkBest > 0)
            {
                kbestPCFG = Math.Max(kbestPCFG, op.testOptions.printPCFGkBest);
            }
        }
Esempio n. 26
0
 /// <summary>
 /// Pass-through constructor; all behavior lives in BaseUnknownWordModel.
 /// </summary>
 public GermanUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter, IDictionary <ILabel, ClassicCounter <string> > tagHash, IDictionary <string, float> unknownGT
                               , ICollection <string> seenEnd)
     : base(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, seenEnd)
 {
 }
Esempio n. 27
0
 /// <summary>Replaces the lexicon used by this object.</summary>
 /// <param name="lexicon">The new lexicon.</param>
 public virtual void SetLexicon(ILexicon lexicon) => lex = lexicon;
Esempio n. 28
0
        /* some documentation for Roger's convenience
         * {pcfg,dep,combo}{PE,DE,TE} are precision/dep/tagging evals for the models
         *
         * parser is the PCFG parser
         * dparser is the dependency parser
         * bparser is the combining parser
         *
         * during testing:
         * tree is the test tree (gold tree)
         * binaryTree is the gold tree binarized
         * tree2b is the best PCFG parse, binarized
         * tree2 is the best PCFG parse (debinarized)
         * tree3 is the dependency parse, binarized
         * tree3db is the dependency parse, debinarized
         * tree4 is the best combo parse, binarized and then debinarized
         * tree4b is the best combo parse, binarized
         */
        public static void Main(string[] args)
        {
            Options op = new Options(new EnglishTreebankParserParams());

            // op.tlpParams may be changed to something else later, so don't use it till
            // after options are parsed.
            StringUtils.LogInvocationString(log, args);
            string path          = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
            int    trainLow      = 200;
            int    trainHigh     = 2199;
            int    testLow       = 2200;
            int    testHigh      = 2219;
            string serializeFile = null;
            int    i             = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-path") && (i + 1 < args.Length))
                {
                    path = args[i + 1];
                    i   += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train") && (i + 2 < args.Length))
                    {
                        trainLow  = System.Convert.ToInt32(args[i + 1]);
                        trainHigh = System.Convert.ToInt32(args[i + 2]);
                        i        += 3;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-test") && (i + 2 < args.Length))
                        {
                            testLow  = System.Convert.ToInt32(args[i + 1]);
                            testHigh = System.Convert.ToInt32(args[i + 2]);
                            i       += 3;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-serialize") && (i + 1 < args.Length))
                            {
                                serializeFile = args[i + 1];
                                i            += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tLPP") && (i + 1 < args.Length))
                                {
                                    try
                                    {
                                        op.tlpParams = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                                    }
                                    catch (TypeLoadException e)
                                    {
                                        log.Info("Class not found: " + args[i + 1]);
                                        throw new Exception(e);
                                    }
                                    catch (InstantiationException e)
                                    {
                                        log.Info("Couldn't instantiate: " + args[i + 1] + ": " + e.ToString());
                                        throw new Exception(e);
                                    }
                                    catch (MemberAccessException e)
                                    {
                                        log.Info("illegal access" + e);
                                        throw new Exception(e);
                                    }
                                    i += 2;
                                }
                                else
                                {
                                    if (args[i].Equals("-encoding"))
                                    {
                                        // sets encoding for TreebankLangParserParams
                                        op.tlpParams.SetInputEncoding(args[i + 1]);
                                        op.tlpParams.SetOutputEncoding(args[i + 1]);
                                        i += 2;
                                    }
                                    else
                                    {
                                        i = op.SetOptionOrWarn(args, i);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // System.out.println(tlpParams.getClass());
            ITreebankLanguagePack tlp = op.tlpParams.TreebankLanguagePack();

            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
            //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
            PrintWriter pw = op.tlpParams.Pw();

            op.testOptions.Display();
            op.trainOptions.Display();
            op.Display();
            op.tlpParams.Display();
            // setup tree transforms
            Treebank       trainTreebank = op.tlpParams.MemoryTreebank();
            MemoryTreebank testTreebank  = op.tlpParams.TestMemoryTreebank();

            // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
            // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
            // blippTreebank.loadPath(blippPath, "", true);
            Timing.StartTime();
            log.Info("Reading trees...");
            testTreebank.LoadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
            if (op.testOptions.increasingLength)
            {
                testTreebank.Sort(new TreeLengthComparator());
            }
            trainTreebank.LoadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
            Timing.Tick("done.");
            log.Info("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.HeadFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlp);
            }
            ITreeTransformer debinarizer      = new Debinarizer(op.forceCNF);
            IList <Tree>     binaryTrainTrees = new List <Tree>();

            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.TreebankLanguagePack());
                if (op.trainOptions.deleteSplitters != null)
                {
                    IList <string> deleted = new List <string>();
                    foreach (string del in op.trainOptions.deleteSplitters)
                    {
                        string baseDel    = tlp.BasicCategory(del);
                        bool   checkBasic = del.Equals(baseDel);
                        for (IEnumerator <string> it = op.trainOptions.splitters.GetEnumerator(); it.MoveNext();)
                        {
                            string elem     = it.Current;
                            string baseElem = tlp.BasicCategory(elem);
                            bool   delStr   = checkBasic && baseElem.Equals(baseDel) || elem.Equals(del);
                            if (delStr)
                            {
                                it.Remove();
                                deleted.Add(elem);
                            }
                        }
                    }
                    log.Info("Removed from vertical splitters: " + deleted);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.HeadFinder(), op.tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.TreebankLanguagePack());
            }
            if (op.trainOptions.hSelSplit)
            {
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    if (op.trainOptions.collinsPunc)
                    {
                        tree = collinsPuncTransformer.TransformTree(tree);
                    }
                    //tree.pennPrint(tlpParams.pw());
                    tree = binarizer.TransformTree(tree);
                }
                //binaryTrainTrees.add(tree);
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                if (op.trainOptions.collinsPunc)
                {
                    tree_1 = collinsPuncTransformer.TransformTree(tree_1);
                }
                tree_1 = binarizer.TransformTree(tree_1);
                binaryTrainTrees.Add(tree_1);
            }
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            IList <Tree> binaryTestTrees = new List <Tree>();

            foreach (Tree tree_2 in testTreebank)
            {
                if (op.trainOptions.collinsPunc)
                {
                    tree_2 = collinsPuncTransformer.TransformTree(tree_2);
                }
                tree_2 = binarizer.TransformTree(tree_2);
                binaryTestTrees.Add(tree_2);
            }
            Timing.Tick("done.");
            // binarization
            BinaryGrammar      bg = null;
            UnaryGrammar       ug = null;
            IDependencyGrammar dg = null;
            // DependencyGrammar dgBLIPP = null;
            ILexicon        lex        = null;
            IIndex <string> stateIndex = new HashIndex <string>();
            // extract grammars
            IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex);

            //Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
            // Extractor lexExtractor = new LexiconExtractor();
            //Extractor dgExtractor = new DependencyMemGrammarExtractor();
            if (op.doPCFG)
            {
                log.Info("Extracting PCFG...");
                Pair <UnaryGrammar, BinaryGrammar> bgug = null;
                if (op.trainOptions.cheatPCFG)
                {
                    IList <Tree> allTrees = new List <Tree>(binaryTrainTrees);
                    Sharpen.Collections.AddAll(allTrees, binaryTestTrees);
                    bgug = bgExtractor.Extract(allTrees);
                }
                else
                {
                    bgug = bgExtractor.Extract(binaryTrainTrees);
                }
                bg = bgug.second;
                bg.SplitRules();
                ug = bgug.first;
                ug.PurgeRules();
                Timing.Tick("done.");
            }
            log.Info("Extracting Lexicon...");
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();

            lex = op.tlpParams.Lex(op, wordIndex, tagIndex);
            lex.InitializeTraining(binaryTrainTrees.Count);
            lex.Train(binaryTrainTrees);
            lex.FinishTraining();
            Timing.Tick("done.");
            if (op.doDep)
            {
                log.Info("Extracting Dependencies...");
                binaryTrainTrees.Clear();
                IExtractor <IDependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
                // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));
                // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
                //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));
                //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
                // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
                dg = dgExtractor.Extract(binaryTrainTrees);
                //uses information whether the words are known or not, discards unknown words
                Timing.Tick("done.");
                //System.out.print("Extracting Unknown Word Model...");
                //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
                //Timing.tick("done.");
                System.Console.Out.Write("Tuning Dependency Model...");
                dg.Tune(binaryTestTrees);
                //System.out.println("TUNE DEPS: "+tuneDeps);
                Timing.Tick("done.");
            }
            BinaryGrammar      boundBG = bg;
            UnaryGrammar       boundUG = ug;
            IGrammarProjection gp      = new NullGrammarProjection(bg, ug);

            // serialization
            if (serializeFile != null)
            {
                log.Info("Serializing parser...");
                LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
                parser.SaveParserToSerialized(serializeFile);
                Timing.Tick("done.");
            }
            // test: pcfg-parse and output
            ExhaustivePCFGParser parser_1 = null;

            if (op.doPCFG)
            {
                parser_1 = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
            }
            ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);
            IScorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser_1, gp, op), dparser) : null);
            //Scorer scorer = parser;
            BiLexPCFGParser bparser = null;

            if (op.doPCFG && op.doDep)
            {
                bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex
                                                                                                                                                                                                    , wordIndex, tagIndex);
            }
            Evalb        pcfgPE         = new Evalb("pcfg  PE", true);
            Evalb        comboPE        = new Evalb("combo PE", true);
            AbstractEval pcfgCB         = new Evalb.CBEval("pcfg  CB", true);
            AbstractEval pcfgTE         = new TaggingEval("pcfg  TE");
            AbstractEval comboTE        = new TaggingEval("combo TE");
            AbstractEval pcfgTEnoPunct  = new TaggingEval("pcfg nopunct TE");
            AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
            AbstractEval depTE          = new TaggingEval("depnd TE");
            AbstractEval depDE          = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.PunctuationWordRejectFilter());
            AbstractEval comboDE        = new UnlabeledAttachmentEval("combo DE", true, null, tlp.PunctuationWordRejectFilter());

            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(op.tlpParams);
            }
            // int[] countByLength = new int[op.testOptions.maxLength+1];
            // Use a reflection ruse, so one can run this without needing the
            // tagger.  Using a function rather than a MaxentTagger means we
            // can distribute a version of the parser that doesn't include the
            // entire tagger.
            IFunction <IList <IHasWord>, List <TaggedWord> > tagger = null;

            if (op.testOptions.preTag)
            {
                try
                {
                    Type[]   argsClass = new Type[] { typeof(string) };
                    object[] arguments = new object[] { op.testOptions.taggerSerializedFile };
                    tagger = (IFunction <IList <IHasWord>, List <TaggedWord> >)Sharpen.Runtime.GetType("edu.stanford.nlp.tagger.maxent.MaxentTagger").GetConstructor(argsClass).NewInstance(arguments);
                }
                catch (Exception e)
                {
                    log.Info(e);
                    log.Info("Warning: No pretagging of sentences will be done.");
                }
            }
            for (int tNum = 0; tNum < ttSize; tNum++)
            {
                Tree tree        = testTreebank[tNum];
                int  testTreeLen = tree_2.Yield().Count;
                if (testTreeLen > op.testOptions.maxLength)
                {
                    continue;
                }
                Tree binaryTree = binaryTestTrees[tNum];
                // countByLength[testTreeLen]++;
                System.Console.Out.WriteLine("-------------------------------------");
                System.Console.Out.WriteLine("Number: " + (tNum + 1));
                System.Console.Out.WriteLine("Length: " + testTreeLen);
                //tree.pennPrint(pw);
                // System.out.println("XXXX The binary tree is");
                // binaryTree.pennPrint(pw);
                //System.out.println("Here are the tags in the lexicon:");
                //System.out.println(lex.showTags());
                //System.out.println("Here's the tagnumberer:");
                //System.out.println(Numberer.getGlobalNumberer("tags").toString());
                long timeMil1 = Runtime.CurrentTimeMillis();
                Timing.Tick("Starting parse.");
                if (op.doPCFG)
                {
                    //log.info(op.testOptions.forceTags);
                    if (op.testOptions.forceTags)
                    {
                        if (tagger != null)
                        {
                            //System.out.println("Using a tagger to set tags");
                            //System.out.println("Tagged sentence as: " + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
                            parser_1.Parse(AddLast(tagger.Apply(CutLast(Wordify(binaryTree.Yield())))));
                        }
                        else
                        {
                            //System.out.println("Forcing tags to match input.");
                            parser_1.Parse(CleanTags(binaryTree.TaggedYield(), tlp));
                        }
                    }
                    else
                    {
                        // System.out.println("XXXX Parsing " + binaryTree.yield());
                        parser_1.Parse(binaryTree.YieldHasWord());
                    }
                }
                //Timing.tick("Done with pcfg phase.");
                if (op.doDep)
                {
                    dparser.Parse(binaryTree.YieldHasWord());
                }
                //Timing.tick("Done with dependency phase.");
                bool bothPassed = false;
                if (op.doPCFG && op.doDep)
                {
                    bothPassed = bparser.Parse(binaryTree.YieldHasWord());
                }
                //Timing.tick("Done with combination phase.");
                long timeMil2 = Runtime.CurrentTimeMillis();
                long elapsed  = timeMil2 - timeMil1;
                log.Info("Time: " + ((int)(elapsed / 100)) / 10.00 + " sec.");
                //System.out.println("PCFG Best Parse:");
                Tree tree2b = null;
                Tree tree2  = null;
                //System.out.println("Got full best parse...");
                if (op.doPCFG)
                {
                    tree2b = parser_1.GetBestParse();
                    tree2  = debinarizer.TransformTree(tree2b);
                }
                //System.out.println("Debinarized parse...");
                //tree2.pennPrint();
                //System.out.println("DepG Best Parse:");
                Tree tree3   = null;
                Tree tree3db = null;
                if (op.doDep)
                {
                    tree3 = dparser.GetBestParse();
                    // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
                    tree3db = debinarizer.TransformTree(tree3);
                    tree3.PennPrint(pw);
                }
                //tree.pennPrint();
                //((Tree)binaryTrainTrees.get(tNum)).pennPrint();
                //System.out.println("Combo Best Parse:");
                Tree tree4 = null;
                if (op.doPCFG && op.doDep)
                {
                    try
                    {
                        tree4 = bparser.GetBestParse();
                        if (tree4 == null)
                        {
                            tree4 = tree2b;
                        }
                    }
                    catch (ArgumentNullException)
                    {
                        log.Info("Blocked, using PCFG parse!");
                        tree4 = tree2b;
                    }
                }
                if (op.doPCFG && !bothPassed)
                {
                    tree4 = tree2b;
                }
                //tree4.pennPrint();
                if (op.doDep)
                {
                    depDE.Evaluate(tree3, binaryTree, pw);
                    depTE.Evaluate(tree3db, tree_2, pw);
                }
                ITreeTransformer tc      = op.tlpParams.Collinizer();
                ITreeTransformer tcEvalb = op.tlpParams.CollinizerEvalb();
                if (op.doPCFG)
                {
                    // System.out.println("XXXX Best PCFG was: ");
                    // tree2.pennPrint();
                    // System.out.println("XXXX Transformed best PCFG is: ");
                    // tc.transformTree(tree2).pennPrint();
                    //System.out.println("True Best Parse:");
                    //tree.pennPrint();
                    //tc.transformTree(tree).pennPrint();
                    pcfgPE.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    pcfgCB.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    Tree tree4b = null;
                    if (op.doDep)
                    {
                        comboDE.Evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
                        tree4b = tree4;
                        tree4  = debinarizer.TransformTree(tree4);
                        if (op.nodePrune)
                        {
                            NodePruner np = new NodePruner(parser_1, debinarizer);
                            tree4 = np.Prune(tree4);
                        }
                        //tree4.pennPrint();
                        comboPE.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw);
                    }
                    //pcfgTE.evaluate(tree2, tree);
                    pcfgTE.Evaluate(tcEvalb.TransformTree(tree2), tcEvalb.TransformTree(tree_2), pw);
                    pcfgTEnoPunct.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    if (op.doDep)
                    {
                        comboTE.Evaluate(tcEvalb.TransformTree(tree4), tcEvalb.TransformTree(tree_2), pw);
                        comboTEnoPunct.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw);
                    }
                    System.Console.Out.WriteLine("PCFG only: " + parser_1.ScoreBinarizedTree(tree2b, 0));
                    //tc.transformTree(tree2).pennPrint();
                    tree2.PennPrint(pw);
                    if (op.doDep)
                    {
                        System.Console.Out.WriteLine("Combo: " + parser_1.ScoreBinarizedTree(tree4b, 0));
                        // tc.transformTree(tree4).pennPrint(pw);
                        tree4.PennPrint(pw);
                    }
                    System.Console.Out.WriteLine("Correct:" + parser_1.ScoreBinarizedTree(binaryTree, 0));

                    /*
                     * if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
                     * System.out.println("SCORE INVERSION");
                     * parser.validateBinarizedTree(binaryTree,0);
                     * }
                     */
                    tree_2.PennPrint(pw);
                }
                // end if doPCFG
                if (op.testOptions.evalb)
                {
                    if (op.doPCFG && op.doDep)
                    {
                        EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree4));
                    }
                    else
                    {
                        if (op.doPCFG)
                        {
                            EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree2));
                        }
                        else
                        {
                            if (op.doDep)
                            {
                                EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree3db));
                            }
                        }
                    }
                }
            }
            // end for each tree in test treebank
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            // op.testOptions.display();
            if (op.doPCFG)
            {
                pcfgPE.Display(false, pw);
                System.Console.Out.WriteLine("Grammar size: " + stateIndex.Size());
                pcfgCB.Display(false, pw);
                if (op.doDep)
                {
                    comboPE.Display(false, pw);
                }
                pcfgTE.Display(false, pw);
                pcfgTEnoPunct.Display(false, pw);
                if (op.doDep)
                {
                    comboTE.Display(false, pw);
                    comboTEnoPunct.Display(false, pw);
                }
            }
            if (op.doDep)
            {
                depTE.Display(false, pw);
                depDE.Display(false, pw);
            }
            if (op.doPCFG && op.doDep)
            {
                comboDE.Display(false, pw);
            }
        }
// Esempio n. 29
 /// <summary>Replaces the lexicon used by this parser.</summary>
 /// <param name="lex">The lexicon to install; callers are responsible for passing a trained instance.</param>
 public virtual void SetLex(ILexicon lex) => this.lex = lex;
// Esempio n. 30
 // Wires up the in-memory indexer with its three collaborators, each obtained
 // from a dedicated factory rather than constructed directly.
 IndexerMemory()
 {
     // Lexicon used for term lookups. NOTE(review): presumably the factory hands
     // back a shared/singleton instance -- confirm before relying on identity.
     this.lexicon = FactoryLexicon.GetLexicon();
     // Index mapping terms to the documents that contain them.
     this.documentIndex = FactoryDocumentIndex.GetDocumentIndex();
     // Document repository backed by a filesystem folder; EnumRepositoryType
     // suggests other storage backends exist but only Folder is requested here.
     this.repDoc = FactoryRepositoryDocument.GetRepositoryDocument(EnumRepositoryType.Folder);
 }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-parser"]        = int.Parse(3);
            flagsToNumArgs["-lex"]           = int.Parse(3);
            flagsToNumArgs["-test"]          = int.Parse(2);
            flagsToNumArgs["-out"]           = int.Parse(1);
            flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
            flagsToNumArgs["-penaltyType"]   = int.Parse(1);
            flagsToNumArgs["-maxLength"]     = int.Parse(1);
            flagsToNumArgs["-stats"]         = int.Parse(2);
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
            bool        eval = argMap.Contains("-eval");
            PrintWriter pw   = null;

            if (argMap.Contains("-out"))
            {
                pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
            }
            log.Info("ChineseCharacterBasedLexicon called with args:");
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            for (int i = 0; i < args.Length; i++)
            {
                ctpp.SetOptionFlag(args, i);
                log.Info(" " + args[i]);
            }
            log.Info();
            Options op = new Options(ctpp);

            if (argMap.Contains("-stats"))
            {
                string[]       statArgs         = (argMap["-stats"]);
                MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    trainFilt        = new NumberRangesFileFilter(statArgs[1], false);
                rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
                log.Info("Done reading trees.");
                MemoryTreebank trainTreebank;
                if (argMap.Contains("-annotate"))
                {
                    trainTreebank = new MemoryTreebank();
                    TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                    foreach (Tree tree in rawTrainTreebank)
                    {
                        trainTreebank.Add(annotator.TransformTree(tree));
                    }
                    log.Info("Done annotating trees.");
                }
                else
                {
                    trainTreebank = rawTrainTreebank;
                }
                PrintStats(trainTreebank, pw);
                System.Environment.Exit(0);
            }
            int maxLength = 1000000;

            //    Test.verbose = true;
            if (argMap.Contains("-norm"))
            {
                op.testOptions.lengthNormalization = true;
            }
            if (argMap.Contains("-maxLength"))
            {
                maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
            }
            op.testOptions.maxLength = 120;
            bool combo = argMap.Contains("-combo");

            if (combo)
            {
                ctpp.useCharacterBasedLexicon = true;
                op.testOptions.maxSpanForTags = 10;
                op.doDep  = false;
                op.dcTags = false;
            }
            LexicalizedParser lp  = null;
            ILexicon          lex = null;

            if (argMap.Contains("-parser"))
            {
                string[] parserArgs = (argMap["-parser"]);
                if (parserArgs.Length > 1)
                {
                    IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
                    lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
                    if (parserArgs.Length == 3)
                    {
                        string filename = parserArgs[2];
                        log.Info("Writing parser in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lp);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string parserFile = parserArgs[0];
                    lp = LexicalizedParser.LoadModel(parserFile, op);
                }
                lex  = lp.GetLexicon();
                op   = lp.GetOp();
                ctpp = (ChineseTreebankParserParams)op.tlpParams;
            }
            if (argMap.Contains("-rad"))
            {
                ctpp.useUnknownCharacterModel = true;
            }
            if (argMap.Contains("-lengthPenalty"))
            {
                ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
            }
            if (argMap.Contains("-penaltyType"))
            {
                ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
            }
            if (argMap.Contains("-lex"))
            {
                string[] lexArgs = (argMap["-lex"]);
                if (lexArgs.Length > 1)
                {
                    IIndex <string> wordIndex = new HashIndex <string>();
                    IIndex <string> tagIndex  = new HashIndex <string>();
                    lex = ctpp.Lex(op, wordIndex, tagIndex);
                    MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                    IFileFilter    trainFilt        = new NumberRangesFileFilter(lexArgs[1], false);
                    rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
                    log.Info("Done reading trees.");
                    MemoryTreebank trainTreebank;
                    if (argMap.Contains("-annotate"))
                    {
                        trainTreebank = new MemoryTreebank();
                        TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                        foreach (Tree tree in rawTrainTreebank)
                        {
                            tree = annotator.TransformTree(tree);
                            trainTreebank.Add(tree);
                        }
                        log.Info("Done annotating trees.");
                    }
                    else
                    {
                        trainTreebank = rawTrainTreebank;
                    }
                    lex.InitializeTraining(trainTreebank.Count);
                    lex.Train(trainTreebank);
                    lex.FinishTraining();
                    log.Info("Done training lexicon.");
                    if (lexArgs.Length == 3)
                    {
                        string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                        log.Info("Writing lexicon in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lex);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
                    log.Info("Reading Lexicon from file " + lexFile);
                    ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
                    try
                    {
                        lex = (ILexicon)@in.ReadObject();
                    }
                    catch (TypeLoadException)
                    {
                        throw new Exception("Bad serialized file: " + lexFile);
                    }
                    @in.Close();
                }
            }
            if (argMap.Contains("-test"))
            {
                bool segmentWords = ctpp.segment;
                bool parse        = lp != null;
                System.Diagnostics.Debug.Assert((parse || segmentWords));
                //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
                //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
                IWordSegmenter seg = null;
                if (segmentWords)
                {
                    seg = (IWordSegmenter)lex;
                }
                string[]       testArgs     = (argMap["-test"]);
                MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    testFilt     = new NumberRangesFileFilter(testArgs[1], false);
                testTreebank.LoadPath(new File(testArgs[0]), testFilt);
                ITreeTransformer          subcategoryStripper = op.tlpParams.SubcategoryStripper();
                ITreeTransformer          collinizer          = ctpp.Collinizer();
                WordCatEquivalenceClasser eqclass             = new WordCatEquivalenceClasser();
                WordCatEqualityChecker    eqcheck             = new WordCatEqualityChecker();
                EquivalenceClassEval      basicEval           = new EquivalenceClassEval(eqclass, eqcheck, "basic");
                EquivalenceClassEval      collinsEval         = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
                IList <string>            evalTypes           = new List <string>(3);
                bool goodPOS = false;
                if (segmentWords)
                {
                    evalTypes.Add(WordCatConstituent.wordType);
                    if (ctpp.segmentMarkov && !parse)
                    {
                        evalTypes.Add(WordCatConstituent.tagType);
                        goodPOS = true;
                    }
                }
                if (parse)
                {
                    evalTypes.Add(WordCatConstituent.tagType);
                    evalTypes.Add(WordCatConstituent.catType);
                    if (combo)
                    {
                        evalTypes.Add(WordCatConstituent.wordType);
                        goodPOS = true;
                    }
                }
                TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
                log.Info("Testing...");
                foreach (Tree goldTop in testTreebank)
                {
                    Tree             gold         = goldTop.FirstChild();
                    IList <IHasWord> goldSentence = gold.YieldHasWord();
                    if (goldSentence.Count > maxLength)
                    {
                        log.Info("Skipping sentence; too long: " + goldSentence.Count);
                        continue;
                    }
                    else
                    {
                        log.Info("Processing sentence; length: " + goldSentence.Count);
                    }
                    IList <IHasWord> s;
                    if (segmentWords)
                    {
                        StringBuilder goldCharBuf = new StringBuilder();
                        foreach (IHasWord aGoldSentence in goldSentence)
                        {
                            StringLabel word = (StringLabel)aGoldSentence;
                            goldCharBuf.Append(word.Value());
                        }
                        string goldChars = goldCharBuf.ToString();
                        s = seg.Segment(goldChars);
                    }
                    else
                    {
                        s = goldSentence;
                    }
                    Tree tree;
                    if (parse)
                    {
                        tree = lp.ParseTree(s);
                        if (tree == null)
                        {
                            throw new Exception("PARSER RETURNED NULL!!!");
                        }
                    }
                    else
                    {
                        tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                        tree = subcategoryStripper.TransformTree(tree);
                    }
                    if (pw != null)
                    {
                        if (parse)
                        {
                            tree.PennPrint(pw);
                        }
                        else
                        {
                            IEnumerator sentIter = s.GetEnumerator();
                            for (; ;)
                            {
                                Word word = (Word)sentIter.Current;
                                pw.Print(word.Word());
                                if (sentIter.MoveNext())
                                {
                                    pw.Print(" ");
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        pw.Println();
                    }
                    if (eval)
                    {
                        ICollection ourBrackets;
                        ICollection goldBrackets;
                        ourBrackets  = proc.AllBrackets(tree);
                        goldBrackets = proc.AllBrackets(gold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                        }
                        basicEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nScores:");
                        basicEval.DisplayLast();
                        Tree collinsTree = collinizer.TransformTree(tree);
                        Tree collinsGold = collinizer.TransformTree(gold);
                        ourBrackets  = proc.AllBrackets(collinsTree);
                        goldBrackets = proc.AllBrackets(collinsGold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                        }
                        collinsEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nCollinized scores:");
                        collinsEval.DisplayLast();
                        System.Console.Out.WriteLine();
                    }
                }
                if (eval)
                {
                    basicEval.Display();
                    System.Console.Out.WriteLine();
                    collinsEval.Display();
                }
            }
        }
 /// <summary>
 /// Convenience constructor: delegates to the main constructor, supplying a
 /// new NullGrammarProjection built from the given binary and unary grammars
 /// (i.e. parsing proceeds with no grammar projection).
 /// All other arguments are passed through unchanged.
 /// </summary>
 public BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IIndex <string> stateIndex, IIndex <string> wordIndex,
                        IIndex <string> tagIndex)
     : this(scorer, fscorer, dparser, bg, ug, dg, lex, op, new NullGrammarProjection(bg, ug), stateIndex, wordIndex, tagIndex)
 {
 }
// Esempio n. 33
// 0
        /// <summary>
        /// Reads one or more treebanks named on the command line and reports how often
        /// each unknown-word signature occurs among rare words, printing signatures in
        /// descending order of frequency (one "signature&lt;TAB&gt;count" line each).
        /// The first half of the treebank only accumulates vocabulary; trees after the
        /// halfway point contribute to the unknown-word counts.
        /// Flags: -l LANG selects the language pack, -e ENC sets the I/O encoding;
        /// remaining arguments are treebank paths.
        /// </summary>
        /// <param name="args">Optional flags followed by one or more treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            DiskTreebank tb       = null;
            string       encoding = "UTF-8";
            Language     lang     = Language.English;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].StartsWith("-"))
                {
                    switch (args[i])
                    {
                    case "-l":
                    {
                        lang = Language.ValueOf(args[++i].Trim());
                        tlpp = lang.@params;
                        break;
                    }

                    case "-e":
                    {
                        encoding = args[++i];
                        break;
                    }

                    default:
                    {
                        System.Console.Out.WriteLine(usage.ToString());
                        System.Environment.Exit(-1);
                        break;
                    }
                    }
                }
                else
                {
                    // First non-flag argument: lazily create the treebank so that it
                    // picks up the encoding chosen by any preceding -e flag.
                    if (tb == null)
                    {
                        if (tlpp == null)
                        {
                            System.Console.Out.WriteLine(usage.ToString());
                            System.Environment.Exit(-1);
                        }
                        else
                        {
                            tlpp.SetInputEncoding(encoding);
                            tlpp.SetOutputEncoding(encoding);
                            tb = tlpp.DiskTreebank();
                        }
                    }
                    tb.LoadPath(args[i]);
                }
            }
            // Robustness fix: if the command line contained only flags, tb was never
            // created and tb.Count below would throw a NullReferenceException.
            if (tb == null)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
            }
            PrintWriter pw = tlpp.Pw();
            Options     op = new Options();

            // Language-specific unknown-word model configuration; other languages keep
            // the Options defaults.
            Options.LexOptions lexOptions = op.lexOptions;
            if (lang == Language.French)
            {
                lexOptions.useUnknownWordSignatures = 1;
                lexOptions.smartMutation            = false;
                lexOptions.unknownSuffixSize        = 2;
                lexOptions.unknownPrefixSize        = 1;
            }
            else
            {
                if (lang == Language.Arabic)
                {
                    lexOptions.smartMutation            = false;
                    lexOptions.useUnknownWordSignatures = 9;
                    lexOptions.unknownPrefixSize        = 1;
                    lexOptions.unknownSuffixSize        = 1;
                }
            }
            IIndex <string>   wordIndex    = new HashIndex <string>();
            IIndex <string>   tagIndex     = new HashIndex <string>();
            ILexicon          lex          = tlpp.Lex(op, wordIndex, tagIndex);
            // Only trees past the halfway point contribute unknown-word counts.
            int               computeAfter = (int)(0.50 * tb.Count);
            ICounter <string> vocab        = new ClassicCounter <string>();
            ICounter <string> unkCounter   = new ClassicCounter <string>();
            int               treeId       = 0;

            foreach (Tree t in tb)
            {
                IList <ILabel> yield = t.Yield();
                int            posId = 0;
                foreach (ILabel word in yield)
                {
                    vocab.IncrementCount(word.Value());
                    // A word seen fewer than twice so far is treated as unknown and
                    // mapped to its signature by the lexicon's unknown-word model.
                    if (treeId > computeAfter && vocab.GetCount(word.Value()) < 2.0)
                    {
                        unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(word.Value(), posId++));
                    }
                }
                treeId++;
            }
            // Report signatures, most frequent first.
            IList <string> biggestKeys = new List <string>(unkCounter.KeySet());

            biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
            foreach (string wordType in biggestKeys)
            {
                pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
            }
            // Fix: pw.Close() was called twice; a single close suffices.
            pw.Close();
        }
 /// <summary>
 /// Convenience constructor: delegates to the token-type constructor using the
 /// given lexicon's TokenTypes. A null lexicon is tolerated — the null-conditional
 /// access passes null token types through to the delegated constructor
 /// (presumably that constructor handles null; verify its contract).
 /// </summary>
 public SExpressionLexer(ILexicon lexicon) : this(lexicon?.TokenTypes)
 {
 }