/// <summary>
/// Prepares the trainer for a fresh run: allocates empty counters and maps
/// for seen/unseen word+tag pairs, and derives the unknown-word strategy
/// flags from the lexicon options.
/// </summary>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    // Fresh counting structures for this training run.
    seenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    tagHash = Generics.NewHashMap();
    tc = new ClassicCounter<ILabel>();
    c = Generics.NewHashMap();
    seenEnd = Generics.NewHashSet();

    // Strategy flags: signatures > 0 enables capitalization (and, with a
    // positive suffix size, suffix-identity) classing; 0 selects Good-Turing.
    useEnd = op.lexOptions.unknownSuffixSize > 0 && op.lexOptions.useUnknownWordSignatures > 0;
    useFirstCap = op.lexOptions.useUnknownWordSignatures > 0;
    useGT = op.lexOptions.useUnknownWordSignatures == 0;
    useFirst = false;  // never enabled here; the branch below is intentionally dead

    if (useFirst)
    {
        log.Info("Including first letter for unknown words.");
    }
    if (useFirstCap)
    {
        log.Info("Including whether first letter is capitalized for unknown words");
    }
    if (useEnd)
    {
        log.Info("Classing unknown word as the average of their equivalents by identity of last " + op.lexOptions.unknownSuffixSize + " letters.");
    }
    if (useGT)
    {
        log.Info("Using Good-Turing smoothing for unknown words.");
    }

    // Unknown-word statistics only start accumulating after this many trees.
    this.indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;
    this.unknownGTTrainer = useGT ? new UnknownGTTrainer() : null;
    this.model = BuildUWM();
}
/// <summary>
/// Spanish unknown-word model; copies the smart-mutation flag and the
/// affix sizes from the lexicon options.
/// </summary>
public SpanishUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter)
    : base(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null)
{
    smartMutation = op.lexOptions.smartMutation;
    unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    unknownPrefixSize = op.lexOptions.unknownPrefixSize;
}
/// <summary>
/// Splits <paramref name="userInput"/> on spaces and looks each piece up
/// in the lexicon.
/// </summary>
/// <param name="lexicon">Lexicon used to match each token.</param>
/// <param name="userInput">Raw input text.</param>
/// <returns>The matched tokens, in input order.</returns>
public IEnumerable<IToken> Tokenize(ILexicon lexicon, string userInput)
{
    List<IToken> list = new List<IToken>();
    string buf = "";
    // BUG FIX: the original do/while indexed userInput[i] before any bounds
    // check, so an empty string threw IndexOutOfRangeException (its
    // `i == userInput.Length` test was evaluated after the indexer and so
    // could never help). A bounds-checked for-loop avoids both problems.
    for (int i = 0; i < userInput.Length; i++)
    {
        if (userInput[i] == ' ')
        {
            // Skip empty buffers so consecutive/leading spaces do not
            // trigger lexicon lookups for "".
            if (buf.Length > 0)
            {
                list.Add(lexicon.Match(buf));
                buf = "";
            }
        }
        else
        {
            buf += userInput[i];
        }
    }
    // Flush the trailing token, if any.
    if (!string.IsNullOrWhiteSpace(buf))
    {
        list.Add(lexicon.Match(buf));
    }
    return list;
}
// boundary tag -- assumed not a real tag
/// <summary>
/// Resets the counters and constructs the French unknown-word model for a
/// fresh training run.
/// </summary>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
    // Unknown-word statistics only start accumulating after this many trees.
    indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;
    seenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
}
/// <summary>Wires the syntax service to its parser and lexicon.</summary>
public SyntaxService(IParser parser, ILexicon lexicon)
{
    m_parser = parser;
    m_lexicon = lexicon;
    // NOTE(review): these ProcessingResources settings were chosen for one
    // particular setup (YMMV) and pull a fair amount of UI logic into this
    // constructor.
    m_processingResources = new ProcessingResources(m_lexicon, null, null, m_parser, 5, null, false, null, null, null, null, false);
}
/// <summary>
/// Stores the shared training context and resets the tree counter;
/// subclasses layer their own state on top of this.
/// </summary>
public virtual void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    this.op = op;
    this.lex = lex;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.totalTrees = totalTrees;
    this.treesRead = 0;
}
/// <summary>
/// Exhaustive dependency chart parser over the given grammar and lexicon.
/// </summary>
public ExhaustiveDependencyParser(IDependencyGrammar dg, ILexicon lex, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    this.dg = dg;
    this.lex = lex;
    this.op = op;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    // Language pack and tree factory used when assembling parses.
    this.tlp = op.Langpack();
    tf = new LabeledScoredTreeFactory();
}
/// <summary>Builds a result view directly from an already-loaded document.</summary>
public DocumentResult(Document doc)
{
    this.docIndex = FactoryDocumentIndex.GetDocumentIndex();
    this.lexicon = FactoryLexicon.GetLexicon();
    // Copy the display fields straight off the document.
    this.DocID = doc.DocID;
    this.File = doc.File;
    this.Title = doc.Title;
    this.WordQuantity = doc.WordQuantity;
}
/// <summary>
/// Creates a new lexicon row and inserts all of its words in a single
/// transaction. On any failure the transaction is rolled back (via
/// dispose), the error is logged, and false is returned.
/// </summary>
/// <param name="lexicon">Lexicon to create; must not be null.</param>
/// <param name="words">Words belonging to the lexicon.</param>
/// <returns>true when the lexicon and all words were inserted.</returns>
public async Task<bool> AddDictionary(ILexicon lexicon, IEnumerable<IWord> words)
{
    try
    {
        if (lexicon == null)
        {
            throw new ArgumentNullException(nameof(lexicon));
        }
        using var cnn = _createdDbConnection();
        cnn.Open();
        // BUG FIX: the transaction is now disposed (implicit rollback on
        // failure) and passed to every command — the original executed the
        // INSERTs without enlisting them in the open transaction.
        using var transaction = cnn.BeginTransaction();
        // Check if Lexicon exists
        string checkQuery = "SELECT Language FROM Lexicon WHERE Language=@Language";
        var existing = await cnn.ExecuteScalarAsync(checkQuery, new { lexicon.Language }, transaction);
        if (existing != null)
        {
            throw new InvalidConstraintException($"Dictionary [{lexicon.Language}] already exists");
        }
        // Create the lexicon row.
        string query = $"INSERT INTO Lexicon {_getSqlInsertFields(typeof(Lexicon))}";
        var res = await cnn.ExecuteAsync(query, lexicon, transaction);
        if (res == 0)
        {
            throw new Exception($"ExecuteAsync failed: {query} [{lexicon.ToJson()}]");
        }
        // Create the words; Id is database-generated, so strip it from the
        // generated field list.
        string wordQuery = $"INSERT INTO Word {_getSqlInsertFields(typeof(Word))}"
            .Replace("@Id,", "")
            .Replace("Id,", "");
        // Materialize once so the sequence is not enumerated twice.
        var wordList = words as ICollection<IWord> ?? words.ToList();
        var resWords = await cnn.ExecuteAsync(wordQuery, wordList, transaction);
        if (resWords != wordList.Count)
        {
            throw new Exception($"ExecuteAsync failed: {wordQuery}");
        }
        transaction.Commit();
        return true;
    }
    catch (Exception ex)
    {
        // BUG FIX: `await _log?.WriteErrorAsync(...)` throws
        // NullReferenceException when _log is null (awaiting a null Task),
        // so guard explicitly.
        if (_log != null)
        {
            await _log.WriteErrorAsync(nameof(WordRepository), nameof(AddDictionary), lexicon?.ToJson(), null, ex);
        }
        return false;
    }
}
/// <summary>
/// Arabic unknown-word model; validates that the configured signature level
/// lies within [MinUnknown, MaxUnknown].
/// </summary>
/// <exception cref="ArgumentException">When useUnknownWordSignatures is out of range.</exception>
public ArabicUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter)
    : base(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null)
{
    // unknownLevel is set by the base constructor from
    // op.lexOptions.useUnknownWordSignatures.
    if (unknownLevel < MinUnknown || unknownLevel > MaxUnknown)
    {
        throw new ArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
    }
    smartMutation = op.lexOptions.smartMutation;
    unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    unknownPrefixSize = op.lexOptions.unknownPrefixSize;
}
/// <summary>
/// Builds a result view by looking the document up in the index.
/// </summary>
/// <param name="docID">Identifier of the document to load.</param>
public DocumentResult(int docID)
{
    this.docIndex = FactoryDocumentIndex.GetDocumentIndex();
    this.lexicon = FactoryLexicon.GetLexicon();
    Document docTmp = docIndex.Search(docID);
    // BUG FIX: the original read `this.DocID = DocID;` — a self-assignment
    // that left the property at its default value. Store the requested id.
    this.DocID = docID;
    this.File = docTmp.File;
    this.Title = docTmp.Title;
    this.WordQuantity = docTmp.WordQuantity;
}
// Records the number of times each word/tag pair was seen in training data.
// "c" maps each tag (as a Label) to a Counter over word signatures; it
// collects the counts that later initialize the probabilities in tagHash.
// "tc" records the marginal count for each tag as an unknown word; it
// should equal c's totalCount.
/// <summary>
/// Sets up the Chinese unknown-word trainer: chooses between Good-Turing
/// smoothing and first-character equivalence classing, then allocates the
/// counting structures and the model that training will populate.
/// </summary>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    // The lexicon (or, failing that, the parser params) may override the
    // default choice of the Good-Turing unknown-word model.
    bool useGoodTuringUnknownWordModel = ChineseTreebankParserParams.DefaultUseGoodTurningUnknownWordModel;
    if (lex is ChineseLexicon chineseLex)
    {
        useGoodTuringUnknownWordModel = chineseLex.useGoodTuringUnknownWordModel;
    }
    else if (op.tlpParams is ChineseTreebankParserParams chineseParams)
    {
        useGoodTuringUnknownWordModel = chineseParams.useGoodTuringUnknownWordModel;
    }

    useFirst = true;
    useGT = op.lexOptions.useUnknownWordSignatures == 0;
    if (useGoodTuringUnknownWordModel)
    {
        useGT = true;
        useFirst = false;
    }
    useUnicodeType = op.lexOptions.useUnicodeType;

    if (useFirst)
    {
        log.Info("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + useUnicodeType);
    }
    if (useGT)
    {
        log.Info("ChineseUWM: using Good-Turing smoothing for unknown words.");
    }

    // Fresh counting structures for this training run.
    c = Generics.NewHashMap();
    tc = new ClassicCounter<ILabel>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    seenCounter = new ClassicCounter<IntTaggedWord>();
    seenFirst = Generics.NewHashSet();
    tagHash = Generics.NewHashMap();
    // Unknown-word statistics only start accumulating after this many trees.
    indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;

    unknownGTTrainer = useGT ? new UnknownGTTrainer() : null;
    IDictionary<string, float> unknownGT = null;
    if (useGT)
    {
        unknownGT = unknownGTTrainer.unknownGT;
    }
    model = new ChineseUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, useGT, seenFirst);
}
/// <summary>
/// Chinese unknown-word model. When Good-Turing smoothing is off, unknown
/// words are classed by the identity of their first character instead.
/// </summary>
/* These strings are stored in ascii-type Unicode encoding. To
 * edit them, either use the Unicode codes or use native2ascii or a
 * similar program to convert the file into a Chinese encoding, then
 * convert back. (midDot characters are used as one clue of being a
 * proper name.)
 */
public ChineseUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter, IDictionary<ILabel, ClassicCounter<string>> tagHash, IDictionary<string, float> unknownGT, bool useGT, ICollection<string> seenFirst)
    : base(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, null)
{
    // First-character classing and Good-Turing are mutually exclusive here.
    this.useGT = useGT;
    this.useFirst = !useGT;
    this.useUnicodeType = op.lexOptions.useUnicodeType;
    this.seenFirst = seenFirst;
}
/// <summary>
/// Tagging evaluator; allocates the per-category counters only when
/// category-level evaluation is enabled.
/// </summary>
public TaggingEval(string str, bool runningAverages, ILexicon lex)
    : base(str, runningAverages)
{
    this.lex = lex;
    if (!doCatLevelEval)
    {
        return;
    }
    // Per-category precision/recall/F1 and out-of-vocabulary bookkeeping.
    precisions = new ClassicCounter<string>();
    recalls = new ClassicCounter<string>();
    f1s = new ClassicCounter<string>();
    precisions2 = new ClassicCounter<string>();
    recalls2 = new ClassicCounter<string>();
    pnums2 = new ClassicCounter<string>();
    rnums2 = new ClassicCounter<string>();
    percentOOV = new ClassicCounter<string>();
    percentOOV2 = new ClassicCounter<string>();
}
/// <summary>
/// Bilexical PCFG parser that combines a PCFG scorer with a dependency
/// parser through the supplied grammar projection.
/// </summary>
internal BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection projection, IIndex<string> stateIndex, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    this.scorer = scorer;
    this.fscorer = fscorer;
    this.dparser = dparser;
    this.projection = projection;
    this.bg = bg;
    this.ug = ug;
    this.dg = dg;
    this.lex = lex;
    this.op = op;
    this.stateIndex = stateIndex;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    // Reusable scratch objects sized by the test options.
    tempEdge = new Edge(op.testOptions.exhaustiveTest);
    tempHook = new Hook(op.testOptions.exhaustiveTest);
}
/// <summary>
/// Trains a LexicalizedParser from a treebank: binarizes the trees,
/// extracts the PCFG, trains the lexicon, and (when op.doDep is set)
/// extracts the dependency grammar as well.
/// </summary>
public virtual LexicalizedParser GetParserDataFromTreebank(Treebank trainTreebank)
{
    log.Info("Binarizing training trees...");
    IList<Tree> binaryTrainTrees = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank);
    Timing.Tick("done.");

    IIndex<string> stateIndex = new HashIndex<string>();

    log.Info("Extracting PCFG...");
    IExtractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
    Pair<UnaryGrammar, BinaryGrammar> bgug = bgExtractor.Extract(binaryTrainTrees);
    BinaryGrammar bg = bgug.second;
    bg.SplitRules();
    UnaryGrammar ug = bgug.first;
    ug.PurgeRules();
    Timing.Tick("done.");

    log.Info("Extracting Lexicon...");
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    ILexicon lex = op.tlpParams.Lex(op, wordIndex, tagIndex);
    lex.InitializeTraining(binaryTrainTrees.Count);
    lex.Train(binaryTrainTrees);
    lex.FinishTraining();
    Timing.Tick("done.");

    IExtractor<IDependencyGrammar> dgExtractor = op.tlpParams.DependencyGrammarExtractor(op, wordIndex, tagIndex);
    IDependencyGrammar dg = null;
    if (op.doDep)
    {
        log.Info("Extracting Dependencies...");
        dg = dgExtractor.Extract(binaryTrainTrees);
        // The dependency grammar consults the lexicon at parse time.
        dg.SetLexicon(lex);
        Timing.Tick("done.");
    }
    log.Info("Done extracting grammars and lexicon.");
    return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
}
/// <summary>
/// Base unknown-word model. The option useUnknownWordSignatures selects
/// between Good-Turing smoothing (0) and signature-based classing (&gt;0);
/// a positive unknownSuffixSize additionally enables suffix-identity
/// classing.
/// </summary>
public BaseUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter, IDictionary<ILabel, ClassicCounter<string>> tagHash, IDictionary<string, float> unknownGT, ICollection<string> seenEnd)
{
    // Strategy flags derived from the lexicon options.
    // TODO: refactor these terms into BaseUnknownWordModelTrainer
    endLength = op.lexOptions.unknownSuffixSize;  // only used when useEnd == true
    useEnd = op.lexOptions.unknownSuffixSize > 0 && op.lexOptions.useUnknownWordSignatures > 0;
    useFirstCap = op.lexOptions.useUnknownWordSignatures > 0;  // only cares whether the first letter is capitalized
    useGT = op.lexOptions.useUnknownWordSignatures == 0;
    useFirst = false;
    unknownLevel = op.lexOptions.useUnknownWordSignatures;

    this.lex = lex;
    this.trainOptions = op.trainOptions;
    this.wordIndex = wordIndex;
    this.tagIndex = tagIndex;
    this.unSeenCounter = unSeenCounter;
    this.tagHash = tagHash;
    this.seenEnd = seenEnd;
    this.unknownGT = unknownGT;
}
/// <summary>Creates an Arabic UWM with empty data structures.</summary>
/// <remarks>
/// Only use this when loading the model data separately, such as by
/// reading in text lines containing the data.
/// </remarks>
public ArabicUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : this(op, lex, wordIndex, tagIndex, new ClassicCounter<IntTaggedWord>())
{
}
/// <summary>
/// Builds the per-query parser stack from a trained LexicalizedParser:
/// a PCFG chart parser (optionally iterative-CKY), a dependency parser,
/// and — when both models are enabled — a combining parser.
/// </summary>
internal LexicalizedParserQuery(LexicalizedParser parser)
{
    this.op = parser.GetOp();
    BinaryGrammar bg = parser.bg;
    UnaryGrammar ug = parser.ug;
    ILexicon lex = parser.lex;
    IDependencyGrammar dg = parser.dg;
    IIndex<string> stateIndex = parser.stateIndex;
    // Words first seen at query time go into a delta on top of the trained index.
    IIndex<string> wordIndex = new DeltaIndex<string>(parser.wordIndex);
    IIndex<string> tagIndex = parser.tagIndex;
    this.debinarizer = new Debinarizer(op.forceCNF);
    this.boundaryRemover = new BoundaryRemover();

    if (op.doPCFG)
    {
        pparser = op.testOptions.iterativeCKY
            ? new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex)
            : new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
    }
    else
    {
        pparser = null;
    }

    if (op.doDep)
    {
        dg.SetLexicon(lex);
        dparser = op.testOptions.useFastFactored
            ? null
            : new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
    }
    else
    {
        dparser = null;
    }

    if (op.doDep && op.doPCFG)
    {
        if (op.testOptions.useFastFactored)
        {
            MLEDependencyGrammar mledg = (MLEDependencyGrammar)dg;
            int numToFind = 1;
            if (op.testOptions.printFactoredKGood > 0)
            {
                numToFind = op.testOptions.printFactoredKGood;
            }
            bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
        }
        else
        {
            // Combine the PCFG and dependency scores.
            IScorer scorer = new TwinScorer(pparser, dparser);
            bparser = op.testOptions.useN5
                ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex)
                : new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
        }
    }
    else
    {
        bparser = null;
    }

    subcategoryStripper = op.tlpParams.SubcategoryStripper();
}
/// <summary>
/// Convenience overload: uses the tagger loaded from
/// <paramref name="pqFactory"/>.
/// </summary>
public EvaluateTreebank(Options op, ILexicon lex, ParserGrammar pqFactory)
    : this(op, lex, pqFactory, pqFactory.LoadTagger())
{
}
/// <summary>
/// Removes the lexicon from the text chunk's context, always releasing the
/// COM references for both the lexicon and (when obtained) the context.
/// </summary>
internal void UnloadDictionary(ILexicon lexicon)
{
    ITextContext textContext = null;
    try
    {
        _textChunk.get_Context(out textContext);
        textContext.RemoveLexicon(lexicon);
    }
    finally
    {
        // Release the COM wrappers even when RemoveLexicon throws.
        Marshal.ReleaseComObject(lexicon);
        if (textContext != null)
        {
            Marshal.ReleaseComObject(textContext);
        }
    }
}
/// <summary>Creates a base UWM with empty data structures.</summary>
/// <remarks>
/// Only use this when loading the model data separately, such as by
/// reading in text lines containing the data.
/// </remarks>
public BaseUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : this(op, lex, wordIndex, tagIndex, new ClassicCounter<IntTaggedWord>(), Generics.NewHashMap<ILabel, ClassicCounter<string>>(), Generics.NewHashMap<string, float>(), Generics.NewHashSet<string>())
{
}
/// <summary>
/// N5 variant of the bilexical PCFG parser; all wiring is delegated to the
/// base constructor.
/// </summary>
internal N5BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser leach, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection proj, IIndex<string> stateIndex, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(scorer, fscorer, leach, bg, ug, dg, lex, op, proj, stateIndex, wordIndex, tagIndex)
{
}
/// <summary>
/// Iterative-deepening CKY variant of the exhaustive PCFG parser; all
/// wiring is delegated to the base constructor.
/// </summary>
public IterativeCKYPCFGParser(BinaryGrammar bg, UnaryGrammar ug, ILexicon lex, Options op, IIndex<string> stateIndex, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(bg, ug, lex, op, stateIndex, wordIndex, tagIndex)
{
}
/// <summary>
/// Builds the treebank evaluator: reads the evaluation switches from
/// op.testOptions.evals and instantiates the corresponding scorers for the
/// PCFG, dependency, and factored parses.
/// </summary>
public EvaluateTreebank(Options op, ILexicon lex, ParserGrammar pqFactory, Func<IList<IHasWord>, IList<TaggedWord>> tagger)
{
    this.op = op;
    this.debinarizer = new Debinarizer(op.forceCNF);
    this.subcategoryStripper = op.tlpParams.SubcategoryStripper();
    this.evals = Generics.NewArrayList();
    Sharpen.Collections.AddAll(evals, pqFactory.GetExtraEvals());
    this.parserQueryEvals = pqFactory.GetParserQueryEvals();
    this.pqFactory = pqFactory;
    this.tagger = tagger;
    collinizer = op.tlpParams.Collinizer();
    boundaryRemover = new BoundaryRemover();

    // Small helpers over the evals property table.
    string Prop(string key) => op.testOptions.evals.GetProperty(key);
    bool Enabled(string key) => bool.Parse(Prop(key));

    bool runningAverages = Enabled("runningAverages");
    summary = Enabled("summary");
    tsv = Enabled("tsv");

    if (!op.trainOptions.leftToRight)
    {
        binarizerOnly = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, false, false, op);
    }
    else
    {
        binarizerOnly = new TreeAnnotatorAndBinarizer(op.tlpParams.HeadFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, false, false, op);
    }

    if (Enabled("pcfgLB"))
    {
        pcfgLB = new Evalb("pcfg LP/LR", runningAverages);
    }
    // TODO: might be nice to allow more than one child-specific scorer
    if (Prop("pcfgChildSpecific") != null)
    {
        string filter = Prop("pcfgChildSpecific");
        pcfgChildSpecific = FilteredEval.ChildFilteredEval("pcfg children matching " + filter + " LP/LR", runningAverages, op.Langpack(), filter);
    }
    if (Enabled("pcfgLA"))
    {
        pcfgLA = new LeafAncestorEval("pcfg LeafAncestor");
    }
    if (Enabled("pcfgCB"))
    {
        pcfgCB = new Evalb.CBEval("pcfg CB", runningAverages);
    }
    if (Enabled("pcfgDA"))
    {
        pcfgDA = new UnlabeledAttachmentEval("pcfg DA", runningAverages, op.Langpack().HeadFinder());
    }
    if (Enabled("pcfgTA"))
    {
        pcfgTA = new TaggingEval("pcfg Tag", runningAverages, lex);
    }
    if (Enabled("depDA"))
    {
        depDA = new UnlabeledAttachmentEval("dep DA", runningAverages, null, op.Langpack().PunctuationWordRejectFilter());
    }
    if (Enabled("depTA"))
    {
        depTA = new TaggingEval("dep Tag", runningAverages, lex);
    }
    if (Enabled("factLB"))
    {
        factLB = new Evalb("factor LP/LR", runningAverages);
    }
    if (Prop("factChildSpecific") != null)
    {
        string filter = Prop("factChildSpecific");
        factChildSpecific = FilteredEval.ChildFilteredEval("fact children matching " + filter + " LP/LR", runningAverages, op.Langpack(), filter);
    }
    if (Enabled("factLA"))
    {
        factLA = new LeafAncestorEval("factor LeafAncestor");
    }
    if (Enabled("factCB"))
    {
        factCB = new Evalb.CBEval("fact CB", runningAverages);
    }
    if (Enabled("factDA"))
    {
        factDA = new UnlabeledAttachmentEval("factor DA", runningAverages, null);
    }
    if (Enabled("factTA"))
    {
        factTA = new TaggingEval("factor Tag", runningAverages, lex);
    }
    if (Enabled("pcfgRUO"))
    {
        pcfgRUO = new AbstractEval.RuleErrorEval("pcfg Rule under/over");
    }
    if (Enabled("pcfgCUO"))
    {
        pcfgCUO = new AbstractEval.CatErrorEval("pcfg Category under/over");
    }
    if (Enabled("pcfgCatE"))
    {
        pcfgCatE = new EvalbByCat("pcfg Category Eval", runningAverages);
    }
    if (Enabled("pcfgLL"))
    {
        pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
    }
    if (Enabled("depLL"))
    {
        depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
    }
    if (Enabled("factLL"))
    {
        factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
    }
    if (Enabled("topMatch"))
    {
        evals.Add(new TopMatchEval("topMatch", runningAverages));
    }
    // This one serves the various k Good/Best options; individual results only.
    kGoodLB = new Evalb("kGood LP/LR", false);
    if (Enabled("pcfgTopK"))
    {
        topKEvals.Add(new BestOfTopKEval(new Evalb("pcfg top k comparisons", false), new Evalb("pcfg top k LP/LR", runningAverages)));
    }
    if (topKEvals.Count > 0)
    {
        kbestPCFG = op.testOptions.evalPCFGkBest;
    }
    if (op.testOptions.printPCFGkBest > 0)
    {
        kbestPCFG = Math.Max(kbestPCFG, op.testOptions.printPCFGkBest);
    }
}
/// <summary>
/// German unknown-word model; behavior is inherited unchanged from the
/// base model.
/// </summary>
public GermanUnknownWordModel(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter, IDictionary<ILabel, ClassicCounter<string>> tagHash, IDictionary<string, float> unknownGT, ICollection<string> seenEnd)
    : base(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, seenEnd)
{
}
/// <summary>Replaces the lexicon used by this grammar.</summary>
public virtual void SetLexicon(ILexicon lexicon) => lex = lexicon;
/* some documentation for Roger's convenience * {pcfg,dep,combo}{PE,DE,TE} are precision/dep/tagging evals for the models * * parser is the PCFG parser * dparser is the dependency parser * bparser is the combining parser * * during testing: * tree is the test tree (gold tree) * binaryTree is the gold tree binarized * tree2b is the best PCFG paser, binarized * tree2 is the best PCFG parse (debinarized) * tree3 is the dependency parse, binarized * tree3db is the dependency parser, debinarized * tree4 is the best combo parse, binarized and then debinarized * tree4b is the best combo parse, binarized */ public static void Main(string[] args) { Options op = new Options(new EnglishTreebankParserParams()); // op.tlpParams may be changed to something else later, so don't use it till // after options are parsed. StringUtils.LogInvocationString(log, args); string path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj"; int trainLow = 200; int trainHigh = 2199; int testLow = 2200; int testHigh = 2219; string serializeFile = null; int i = 0; while (i < args.Length && args[i].StartsWith("-")) { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-path") && (i + 1 < args.Length)) { path = args[i + 1]; i += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train") && (i + 2 < args.Length)) { trainLow = System.Convert.ToInt32(args[i + 1]); trainHigh = System.Convert.ToInt32(args[i + 2]); i += 3; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-test") && (i + 2 < args.Length)) { testLow = System.Convert.ToInt32(args[i + 1]); testHigh = System.Convert.ToInt32(args[i + 2]); i += 3; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-serialize") && (i + 1 < args.Length)) { serializeFile = args[i + 1]; i += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tLPP") && (i + 1 < args.Length)) { try { op.tlpParams = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1])); } catch (TypeLoadException e) { log.Info("Class not 
found: " + args[i + 1]); throw new Exception(e); } catch (InstantiationException e) { log.Info("Couldn't instantiate: " + args[i + 1] + ": " + e.ToString()); throw new Exception(e); } catch (MemberAccessException e) { log.Info("illegal access" + e); throw new Exception(e); } i += 2; } else { if (args[i].Equals("-encoding")) { // sets encoding for TreebankLangParserParams op.tlpParams.SetInputEncoding(args[i + 1]); op.tlpParams.SetOutputEncoding(args[i + 1]); i += 2; } else { i = op.SetOptionOrWarn(args, i); } } } } } } } // System.out.println(tlpParams.getClass()); ITreebankLanguagePack tlp = op.tlpParams.TreebankLanguagePack(); op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters())); // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams); PrintWriter pw = op.tlpParams.Pw(); op.testOptions.Display(); op.trainOptions.Display(); op.Display(); op.tlpParams.Display(); // setup tree transforms Treebank trainTreebank = op.tlpParams.MemoryTreebank(); MemoryTreebank testTreebank = op.tlpParams.TestMemoryTreebank(); // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank(); // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/"; // blippTreebank.loadPath(blippPath, "", true); Timing.StartTime(); log.Info("Reading trees..."); testTreebank.LoadPath(path, new NumberRangeFileFilter(testLow, testHigh, true)); if (op.testOptions.increasingLength) { testTreebank.Sort(new TreeLengthComparator()); } trainTreebank.LoadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true)); Timing.Tick("done."); log.Info("Binarizing trees..."); TreeAnnotatorAndBinarizer binarizer; if (!op.trainOptions.leftToRight) { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op); } else { binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.HeadFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, 
op); } CollinsPuncTransformer collinsPuncTransformer = null; if (op.trainOptions.collinsPunc) { collinsPuncTransformer = new CollinsPuncTransformer(tlp); } ITreeTransformer debinarizer = new Debinarizer(op.forceCNF); IList <Tree> binaryTrainTrees = new List <Tree>(); if (op.trainOptions.selectiveSplit) { op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.TreebankLanguagePack()); if (op.trainOptions.deleteSplitters != null) { IList <string> deleted = new List <string>(); foreach (string del in op.trainOptions.deleteSplitters) { string baseDel = tlp.BasicCategory(del); bool checkBasic = del.Equals(baseDel); for (IEnumerator <string> it = op.trainOptions.splitters.GetEnumerator(); it.MoveNext();) { string elem = it.Current; string baseElem = tlp.BasicCategory(elem); bool delStr = checkBasic && baseElem.Equals(baseDel) || elem.Equals(del); if (delStr) { it.Remove(); deleted.Add(elem); } } } log.Info("Removed from vertical splitters: " + deleted); } } if (op.trainOptions.selectivePostSplit) { ITreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.HeadFinder(), op.tlpParams, op); Treebank annotatedTB = trainTreebank.Transform(myTransformer); op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.TreebankLanguagePack()); } if (op.trainOptions.hSelSplit) { binarizer.SetDoSelectiveSplit(false); foreach (Tree tree in trainTreebank) { if (op.trainOptions.collinsPunc) { tree = collinsPuncTransformer.TransformTree(tree); } //tree.pennPrint(tlpParams.pw()); tree = binarizer.TransformTree(tree); } //binaryTrainTrees.add(tree); binarizer.SetDoSelectiveSplit(true); } foreach (Tree tree_1 in trainTreebank) { if (op.trainOptions.collinsPunc) { tree_1 = 
collinsPuncTransformer.TransformTree(tree_1); } tree_1 = binarizer.TransformTree(tree_1); binaryTrainTrees.Add(tree_1); } if (op.testOptions.verbose) { binarizer.DumpStats(); } IList <Tree> binaryTestTrees = new List <Tree>(); foreach (Tree tree_2 in testTreebank) { if (op.trainOptions.collinsPunc) { tree_2 = collinsPuncTransformer.TransformTree(tree_2); } tree_2 = binarizer.TransformTree(tree_2); binaryTestTrees.Add(tree_2); } Timing.Tick("done."); // binarization BinaryGrammar bg = null; UnaryGrammar ug = null; IDependencyGrammar dg = null; // DependencyGrammar dgBLIPP = null; ILexicon lex = null; IIndex <string> stateIndex = new HashIndex <string>(); // extract grammars IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex); //Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor(); // Extractor lexExtractor = new LexiconExtractor(); //Extractor dgExtractor = new DependencyMemGrammarExtractor(); if (op.doPCFG) { log.Info("Extracting PCFG..."); Pair <UnaryGrammar, BinaryGrammar> bgug = null; if (op.trainOptions.cheatPCFG) { IList <Tree> allTrees = new List <Tree>(binaryTrainTrees); Sharpen.Collections.AddAll(allTrees, binaryTestTrees); bgug = bgExtractor.Extract(allTrees); } else { bgug = bgExtractor.Extract(binaryTrainTrees); } bg = bgug.second; bg.SplitRules(); ug = bgug.first; ug.PurgeRules(); Timing.Tick("done."); } log.Info("Extracting Lexicon..."); IIndex <string> wordIndex = new HashIndex <string>(); IIndex <string> tagIndex = new HashIndex <string>(); lex = op.tlpParams.Lex(op, wordIndex, tagIndex); lex.InitializeTraining(binaryTrainTrees.Count); lex.Train(binaryTrainTrees); lex.FinishTraining(); Timing.Tick("done."); if (op.doDep) { log.Info("Extracting Dependencies..."); binaryTrainTrees.Clear(); IExtractor <IDependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex); // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new 
ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true)); // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true)); //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams)); //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams)); // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2); dg = dgExtractor.Extract(binaryTrainTrees); //uses information whether the words are known or not, discards unknown words Timing.Tick("done."); //System.out.print("Extracting Unknown Word Model..."); //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees); //Timing.tick("done."); System.Console.Out.Write("Tuning Dependency Model..."); dg.Tune(binaryTestTrees); //System.out.println("TUNE DEPS: "+tuneDeps); Timing.Tick("done."); } BinaryGrammar boundBG = bg; UnaryGrammar boundUG = ug; IGrammarProjection gp = new NullGrammarProjection(bg, ug); // serialization if (serializeFile != null) { log.Info("Serializing parser..."); LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op); parser.SaveParserToSerialized(serializeFile); Timing.Tick("done."); } // test: pcfg-parse and output ExhaustivePCFGParser parser_1 = null; if (op.doPCFG) { parser_1 = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex); } ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null); IScorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser_1, gp, op), dparser) : null); //Scorer scorer = parser; BiLexPCFGParser bparser = null; if (op.doPCFG && op.doDep) { bparser = (op.testOptions.useN5) ? 
new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex , wordIndex, tagIndex); } Evalb pcfgPE = new Evalb("pcfg PE", true); Evalb comboPE = new Evalb("combo PE", true); AbstractEval pcfgCB = new Evalb.CBEval("pcfg CB", true); AbstractEval pcfgTE = new TaggingEval("pcfg TE"); AbstractEval comboTE = new TaggingEval("combo TE"); AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE"); AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE"); AbstractEval depTE = new TaggingEval("depnd TE"); AbstractEval depDE = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.PunctuationWordRejectFilter()); AbstractEval comboDE = new UnlabeledAttachmentEval("combo DE", true, null, tlp.PunctuationWordRejectFilter()); if (op.testOptions.evalb) { EvalbFormatWriter.InitEVALBfiles(op.tlpParams); } // int[] countByLength = new int[op.testOptions.maxLength+1]; // Use a reflection ruse, so one can run this without needing the // tagger. Using a function rather than a MaxentTagger means we // can distribute a version of the parser that doesn't include the // entire tagger. 
IFunction <IList <IHasWord>, List <TaggedWord> > tagger = null; if (op.testOptions.preTag) { try { Type[] argsClass = new Type[] { typeof(string) }; object[] arguments = new object[] { op.testOptions.taggerSerializedFile }; tagger = (IFunction <IList <IHasWord>, List <TaggedWord> >)Sharpen.Runtime.GetType("edu.stanford.nlp.tagger.maxent.MaxentTagger").GetConstructor(argsClass).NewInstance(arguments); } catch (Exception e) { log.Info(e); log.Info("Warning: No pretagging of sentences will be done."); } } for (int tNum = 0; tNum < ttSize; tNum++) { Tree tree = testTreebank[tNum]; int testTreeLen = tree_2.Yield().Count; if (testTreeLen > op.testOptions.maxLength) { continue; } Tree binaryTree = binaryTestTrees[tNum]; // countByLength[testTreeLen]++; System.Console.Out.WriteLine("-------------------------------------"); System.Console.Out.WriteLine("Number: " + (tNum + 1)); System.Console.Out.WriteLine("Length: " + testTreeLen); //tree.pennPrint(pw); // System.out.println("XXXX The binary tree is"); // binaryTree.pennPrint(pw); //System.out.println("Here are the tags in the lexicon:"); //System.out.println(lex.showTags()); //System.out.println("Here's the tagnumberer:"); //System.out.println(Numberer.getGlobalNumberer("tags").toString()); long timeMil1 = Runtime.CurrentTimeMillis(); Timing.Tick("Starting parse."); if (op.doPCFG) { //log.info(op.testOptions.forceTags); if (op.testOptions.forceTags) { if (tagger != null) { //System.out.println("Using a tagger to set tags"); //System.out.println("Tagged sentence as: " + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false)); parser_1.Parse(AddLast(tagger.Apply(CutLast(Wordify(binaryTree.Yield()))))); } else { //System.out.println("Forcing tags to match input."); parser_1.Parse(CleanTags(binaryTree.TaggedYield(), tlp)); } } else { // System.out.println("XXXX Parsing " + binaryTree.yield()); parser_1.Parse(binaryTree.YieldHasWord()); } } //Timing.tick("Done with pcfg phase."); if (op.doDep) { 
dparser.Parse(binaryTree.YieldHasWord()); } //Timing.tick("Done with dependency phase."); bool bothPassed = false; if (op.doPCFG && op.doDep) { bothPassed = bparser.Parse(binaryTree.YieldHasWord()); } //Timing.tick("Done with combination phase."); long timeMil2 = Runtime.CurrentTimeMillis(); long elapsed = timeMil2 - timeMil1; log.Info("Time: " + ((int)(elapsed / 100)) / 10.00 + " sec."); //System.out.println("PCFG Best Parse:"); Tree tree2b = null; Tree tree2 = null; //System.out.println("Got full best parse..."); if (op.doPCFG) { tree2b = parser_1.GetBestParse(); tree2 = debinarizer.TransformTree(tree2b); } //System.out.println("Debinarized parse..."); //tree2.pennPrint(); //System.out.println("DepG Best Parse:"); Tree tree3 = null; Tree tree3db = null; if (op.doDep) { tree3 = dparser.GetBestParse(); // was: but wrong Tree tree3db = debinarizer.transformTree(tree2); tree3db = debinarizer.TransformTree(tree3); tree3.PennPrint(pw); } //tree.pennPrint(); //((Tree)binaryTrainTrees.get(tNum)).pennPrint(); //System.out.println("Combo Best Parse:"); Tree tree4 = null; if (op.doPCFG && op.doDep) { try { tree4 = bparser.GetBestParse(); if (tree4 == null) { tree4 = tree2b; } } catch (ArgumentNullException) { log.Info("Blocked, using PCFG parse!"); tree4 = tree2b; } } if (op.doPCFG && !bothPassed) { tree4 = tree2b; } //tree4.pennPrint(); if (op.doDep) { depDE.Evaluate(tree3, binaryTree, pw); depTE.Evaluate(tree3db, tree_2, pw); } ITreeTransformer tc = op.tlpParams.Collinizer(); ITreeTransformer tcEvalb = op.tlpParams.CollinizerEvalb(); if (op.doPCFG) { // System.out.println("XXXX Best PCFG was: "); // tree2.pennPrint(); // System.out.println("XXXX Transformed best PCFG is: "); // tc.transformTree(tree2).pennPrint(); //System.out.println("True Best Parse:"); //tree.pennPrint(); //tc.transformTree(tree).pennPrint(); pcfgPE.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw); pcfgCB.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw); Tree tree4b = 
null; if (op.doDep) { comboDE.Evaluate((bothPassed ? tree4 : tree3), binaryTree, pw); tree4b = tree4; tree4 = debinarizer.TransformTree(tree4); if (op.nodePrune) { NodePruner np = new NodePruner(parser_1, debinarizer); tree4 = np.Prune(tree4); } //tree4.pennPrint(); comboPE.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw); } //pcfgTE.evaluate(tree2, tree); pcfgTE.Evaluate(tcEvalb.TransformTree(tree2), tcEvalb.TransformTree(tree_2), pw); pcfgTEnoPunct.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw); if (op.doDep) { comboTE.Evaluate(tcEvalb.TransformTree(tree4), tcEvalb.TransformTree(tree_2), pw); comboTEnoPunct.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw); } System.Console.Out.WriteLine("PCFG only: " + parser_1.ScoreBinarizedTree(tree2b, 0)); //tc.transformTree(tree2).pennPrint(); tree2.PennPrint(pw); if (op.doDep) { System.Console.Out.WriteLine("Combo: " + parser_1.ScoreBinarizedTree(tree4b, 0)); // tc.transformTree(tree4).pennPrint(pw); tree4.PennPrint(pw); } System.Console.Out.WriteLine("Correct:" + parser_1.ScoreBinarizedTree(binaryTree, 0)); /* * if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) { * System.out.println("SCORE INVERSION"); * parser.validateBinarizedTree(binaryTree,0); * } */ tree_2.PennPrint(pw); } // end if doPCFG if (op.testOptions.evalb) { if (op.doPCFG && op.doDep) { EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree4)); } else { if (op.doPCFG) { EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree2)); } else { if (op.doDep) { EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree3db)); } } } } } // end for each tree in test treebank if (op.testOptions.evalb) { EvalbFormatWriter.CloseEVALBfiles(); } // op.testOptions.display(); if (op.doPCFG) { pcfgPE.Display(false, pw); System.Console.Out.WriteLine("Grammar size: " + stateIndex.Size()); 
pcfgCB.Display(false, pw); if (op.doDep) { comboPE.Display(false, pw); } pcfgTE.Display(false, pw); pcfgTEnoPunct.Display(false, pw); if (op.doDep) { comboTE.Display(false, pw); comboTEnoPunct.Display(false, pw); } } if (op.doDep) { depTE.Display(false, pw); depDE.Display(false, pw); } if (op.doPCFG && op.doDep) { comboDE.Display(false, pw); } }
/// <summary>Replaces the lexicon used by this parser component.</summary>
/// <param name="lex">The lexicon to use from now on.</param>
public virtual void SetLex(ILexicon lex) => this.lex = lex;
// Wires up the indexer's collaborators from their factories.
// NOTE(review): assumes the three factories are side-effect free apart from
// producing their singletons — confirm if initialization order ever matters.
IndexerMemory()
{
    // Lexicon and document index come from their default factories.
    this.lexicon = FactoryLexicon.GetLexicon();
    this.documentIndex = FactoryDocumentIndex.GetDocumentIndex();
    // Documents are sourced from a folder-backed repository.
    this.repDoc = FactoryRepositoryDocument.GetRepositoryDocument(EnumRepositoryType.Folder);
}
/// <summary>
/// Command-line driver for the Chinese character-based lexicon: can print treebank
/// statistics (-stats), train or load a parser (-parser) and/or lexicon (-lex),
/// and evaluate segmentation/tagging/parsing on a test treebank (-test).
/// Output is written with GB18030 encoding when -out is given.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Number of values each multi-argument flag consumes.
    IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();
    flagsToNumArgs["-parser"] = int.Parse(3);
    flagsToNumArgs["-lex"] = int.Parse(3);
    flagsToNumArgs["-test"] = int.Parse(2);
    flagsToNumArgs["-out"] = int.Parse(1);
    flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
    flagsToNumArgs["-penaltyType"] = int.Parse(1);
    flagsToNumArgs["-maxLength"] = int.Parse(1);
    flagsToNumArgs["-stats"] = int.Parse(2);
    IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
    bool eval = argMap.Contains("-eval");
    PrintWriter pw = null;
    if (argMap.Contains("-out"))
    {
        // GB18030 so Chinese characters are written correctly.
        pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
    }
    log.Info("ChineseCharacterBasedLexicon called with args:");
    ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
    // Every raw arg is also offered to the parser params as an option flag.
    for (int i = 0; i < args.Length; i++)
    {
        ctpp.SetOptionFlag(args, i);
        log.Info(" " + args[i]);
    }
    log.Info();
    Options op = new Options(ctpp);
    if (argMap.Contains("-stats"))
    {
        // Stats-only mode: load (optionally annotated) training trees, print stats, exit.
        string[] statArgs = (argMap["-stats"]);
        MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter trainFilt = new NumberRangesFileFilter(statArgs[1], false);
        rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
        log.Info("Done reading trees.");
        MemoryTreebank trainTreebank;
        if (argMap.Contains("-annotate"))
        {
            trainTreebank = new MemoryTreebank();
            TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
            foreach (Tree tree in rawTrainTreebank)
            {
                trainTreebank.Add(annotator.TransformTree(tree));
            }
            log.Info("Done annotating trees.");
        }
        else
        {
            trainTreebank = rawTrainTreebank;
        }
        PrintStats(trainTreebank, pw);
        System.Environment.Exit(0);
    }
    int maxLength = 1000000;
    // Test.verbose = true;
    if (argMap.Contains("-norm"))
    {
        op.testOptions.lengthNormalization = true;
    }
    if (argMap.Contains("-maxLength"))
    {
        maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
    }
    // NOTE(review): testOptions.maxLength is forced to 120 regardless of -maxLength;
    // the -maxLength value only gates the per-sentence skip in the -test loop below.
    op.testOptions.maxLength = 120;
    bool combo = argMap.Contains("-combo");
    if (combo)
    {
        ctpp.useCharacterBasedLexicon = true;
        op.testOptions.maxSpanForTags = 10;
        op.doDep = false;
        op.dcTags = false;
    }
    LexicalizedParser lp = null;
    ILexicon lex = null;
    if (argMap.Contains("-parser"))
    {
        // Either train a parser from a treebank (2-3 args) or load a serialized one (1 arg).
        string[] parserArgs = (argMap["-parser"]);
        if (parserArgs.Length > 1)
        {
            IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
            lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
            if (parserArgs.Length == 3)
            {
                string filename = parserArgs[2];
                log.Info("Writing parser in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lp);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            string parserFile = parserArgs[0];
            lp = LexicalizedParser.LoadModel(parserFile, op);
        }
        // The loaded/trained parser's lexicon and options replace the local ones.
        lex = lp.GetLexicon();
        op = lp.GetOp();
        ctpp = (ChineseTreebankParserParams)op.tlpParams;
    }
    if (argMap.Contains("-rad"))
    {
        ctpp.useUnknownCharacterModel = true;
    }
    if (argMap.Contains("-lengthPenalty"))
    {
        ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
    }
    if (argMap.Contains("-penaltyType"))
    {
        ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
    }
    if (argMap.Contains("-lex"))
    {
        // Either train a lexicon from a treebank (2-3 args) or deserialize one (1 arg).
        string[] lexArgs = (argMap["-lex"]);
        if (lexArgs.Length > 1)
        {
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex = new HashIndex <string>();
            lex = ctpp.Lex(op, wordIndex, tagIndex);
            MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
            IFileFilter trainFilt = new NumberRangesFileFilter(lexArgs[1], false);
            rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
            log.Info("Done reading trees.");
            MemoryTreebank trainTreebank;
            if (argMap.Contains("-annotate"))
            {
                trainTreebank = new MemoryTreebank();
                TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                foreach (Tree tree in rawTrainTreebank)
                {
                    tree = annotator.TransformTree(tree);
                    trainTreebank.Add(tree);
                }
                log.Info("Done annotating trees.");
            }
            else
            {
                trainTreebank = rawTrainTreebank;
            }
            lex.InitializeTraining(trainTreebank.Count);
            lex.Train(trainTreebank);
            lex.FinishTraining();
            log.Info("Done training lexicon.");
            if (lexArgs.Length == 3)
            {
                string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                log.Info("Writing lexicon in serialized format to file " + filename + " ");
                System.Console.Error.Flush();
                ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                @out.WriteObject(lex);
                @out.Close();
                log.Info("done.");
            }
        }
        else
        {
            string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
            log.Info("Reading Lexicon from file " + lexFile);
            ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
            try
            {
                lex = (ILexicon)@in.ReadObject();
            }
            catch (TypeLoadException)
            {
                throw new Exception("Bad serialized file: " + lexFile);
            }
            @in.Close();
        }
    }
    if (argMap.Contains("-test"))
    {
        // Evaluation mode: segmentation requires the lexicon to double as a word segmenter.
        bool segmentWords = ctpp.segment;
        bool parse = lp != null;
        System.Diagnostics.Debug.Assert((parse || segmentWords));
        // WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
        // WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
        IWordSegmenter seg = null;
        if (segmentWords)
        {
            seg = (IWordSegmenter)lex;
        }
        string[] testArgs = (argMap["-test"]);
        MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
        IFileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
        testTreebank.LoadPath(new File(testArgs[0]), testFilt);
        ITreeTransformer subcategoryStripper = op.tlpParams.SubcategoryStripper();
        ITreeTransformer collinizer = ctpp.Collinizer();
        WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
        WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
        EquivalenceClassEval basicEval = new EquivalenceClassEval(eqclass, eqcheck, "basic");
        EquivalenceClassEval collinsEval = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
        // Which bracket types get scored depends on the mode (segmentation vs. parsing vs. combo).
        IList <string> evalTypes = new List <string>(3);
        bool goodPOS = false;
        if (segmentWords)
        {
            evalTypes.Add(WordCatConstituent.wordType);
            if (ctpp.segmentMarkov && !parse)
            {
                evalTypes.Add(WordCatConstituent.tagType);
                goodPOS = true;
            }
        }
        if (parse)
        {
            evalTypes.Add(WordCatConstituent.tagType);
            evalTypes.Add(WordCatConstituent.catType);
            if (combo)
            {
                evalTypes.Add(WordCatConstituent.wordType);
                goodPOS = true;
            }
        }
        TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
        log.Info("Testing...");
        foreach (Tree goldTop in testTreebank)
        {
            Tree gold = goldTop.FirstChild();
            IList <IHasWord> goldSentence = gold.YieldHasWord();
            if (goldSentence.Count > maxLength)
            {
                log.Info("Skipping sentence; too long: " + goldSentence.Count);
                continue;
            }
            else
            {
                log.Info("Processing sentence; length: " + goldSentence.Count);
            }
            IList <IHasWord> s;
            if (segmentWords)
            {
                // Strip the gold segmentation: concatenate the characters, then re-segment.
                StringBuilder goldCharBuf = new StringBuilder();
                foreach (IHasWord aGoldSentence in goldSentence)
                {
                    StringLabel word = (StringLabel)aGoldSentence;
                    goldCharBuf.Append(word.Value());
                }
                string goldChars = goldCharBuf.ToString();
                s = seg.Segment(goldChars);
            }
            else
            {
                s = goldSentence;
            }
            Tree tree;
            if (parse)
            {
                tree = lp.ParseTree(s);
                if (tree == null)
                {
                    throw new Exception("PARSER RETURNED NULL!!!");
                }
            }
            else
            {
                // No parser: build a flat tree over the segmented words for evaluation.
                tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                tree = subcategoryStripper.TransformTree(tree);
            }
            if (pw != null)
            {
                if (parse)
                {
                    tree.PennPrint(pw);
                }
                else
                {
                    IEnumerator sentIter = s.GetEnumerator();
                    for (; ;)
                    {
                        Word word = (Word)sentIter.Current;
                        pw.Print(word.Word());
                        if (sentIter.MoveNext())
                        {
                            pw.Print(" ");
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                pw.Println();
            }
            if (eval)
            {
                // Score raw brackets, then collinized (punctuation-normalized) brackets.
                ICollection ourBrackets;
                ICollection goldBrackets;
                ourBrackets = proc.AllBrackets(tree);
                goldBrackets = proc.AllBrackets(gold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                }
                basicEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nScores:");
                basicEval.DisplayLast();
                Tree collinsTree = collinizer.TransformTree(tree);
                Tree collinsGold = collinizer.TransformTree(gold);
                ourBrackets = proc.AllBrackets(collinsTree);
                goldBrackets = proc.AllBrackets(collinsGold);
                if (goodPOS)
                {
                    Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                    Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                }
                collinsEval.Eval(ourBrackets, goldBrackets);
                System.Console.Out.WriteLine("\nCollinized scores:");
                collinsEval.DisplayLast();
                System.Console.Out.WriteLine();
            }
        }
        if (eval)
        {
            basicEval.Display();
            System.Console.Out.WriteLine();
            collinsEval.Display();
        }
    }
}
/// <summary>
/// Convenience constructor: delegates to the full constructor using a
/// <see cref="NullGrammarProjection"/> built from the supplied grammars,
/// i.e. no state-space projection is applied.
/// </summary>
public BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IIndex <string> stateIndex, IIndex <string> wordIndex, IIndex <string> tagIndex)
    : this(scorer, fscorer, dparser, bg, ug, dg, lex, op, new NullGrammarProjection(bg, ug), stateIndex, wordIndex, tagIndex)
{
}
/// <summary>
/// Counts unknown-word signatures over the second half of a treebank: words seen
/// fewer than twice so far are mapped through the lexicon's unknown-word model,
/// and signature frequencies are printed in descending order.
/// Usage: [-l LANGUAGE] [-e ENCODING] TREEBANK_PATH...
/// </summary>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].StartsWith("-"))
        {
            switch (args[i])
            {
                case "-l":
                {
                    // Language selection also swaps in the matching parser params.
                    lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }

                case "-e":
                {
                    encoding = args[++i];
                    break;
                }

                default:
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        }
        else
        {
            // First non-flag argument triggers treebank setup; all of them are loaded.
            if (tb == null)
            {
                if (tlpp == null)
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                }
                else
                {
                    tlpp.SetInputEncoding(encoding);
                    tlpp.SetOutputEncoding(encoding);
                    tb = tlpp.DiskTreebank();
                }
            }
            tb.LoadPath(args[i]);
        }
    }
    PrintWriter pw = tlpp.Pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    // Language-specific unknown-word model settings.
    if (lang == Language.French)
    {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    }
    else
    {
        if (lang == Language.Arabic)
        {
            lexOptions.smartMutation = false;
            lexOptions.useUnknownWordSignatures = 9;
            lexOptions.unknownPrefixSize = 1;
            lexOptions.unknownSuffixSize = 1;
        }
    }
    IIndex <string> wordIndex = new HashIndex <string>();
    IIndex <string> tagIndex = new HashIndex <string>();
    ILexicon lex = tlpp.Lex(op, wordIndex, tagIndex);
    // Only start counting unknowns after half the treebank has built up the vocabulary.
    int computeAfter = (int)(0.50 * tb.Count);
    ICounter <string> vocab = new ClassicCounter <string>();
    ICounter <string> unkCounter = new ClassicCounter <string>();
    int treeId = 0;
    foreach (Tree t in tb)
    {
        IList <ILabel> yield = t.Yield();
        int posId = 0;
        foreach (ILabel word in yield)
        {
            vocab.IncrementCount(word.Value());
            // A word with count < 2 at this point is treated as unknown.
            if (treeId > computeAfter && vocab.GetCount(word.Value()) < 2.0)
            {
                // if(lex.getUnknownWordModel().getSignature(word.value(), posId++).equals("UNK"))
                //   pw.println(word.value());
                unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(word.Value(), posId++));
            }
        }
        treeId++;
    }
    // BUGFIX: must be List<string>, not IList<string> — IList<T> has no Sort method.
    List <string> biggestKeys = new List <string>(unkCounter.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
    foreach (string wordType in biggestKeys)
    {
        pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
    }
    // BUGFIX: the writer was closed twice; a single Close suffices.
    pw.Close();
}
/// <summary>
/// Builds a lexer from a lexicon, delegating to the token-type-based constructor.
/// Uses the null-conditional operator, so a null <paramref name="lexicon"/>
/// passes null token types to the delegated constructor rather than throwing here.
/// </summary>
/// <param name="lexicon">Source of token types; may be null.</param>
public SExpressionLexer(ILexicon lexicon) : this(lexicon?.TokenTypes)
{
}