/// <summary>
/// Scores every local tree (depth-1 subtree) of each annotated gold tree in the
/// test treebank against the parser's grammar, printing each rule and its score.
/// </summary>
/// <param name="pd">Parser whose grammar is used for scoring.</param>
/// <param name="tlpParams">Language-pack parameters supplying the head finder.</param>
/// <param name="testTreebank">Gold trees to check.</param>
/// <param name="treebankRoot">Treebank path used to load English split categories.</param>
/// <param name="stateIndex">State index used when computing local tree scores.</param>
private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex<string> stateIndex)
{
    Timing.StartTime();
    ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones. Otherwise need training data.
    op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
    foreach (Tree goldTree in testTreebank)
    {
        // BUG FIX: a foreach iteration variable cannot be reassigned in C#;
        // hold the annotated tree in its own local instead.
        Tree annotated = annotator.TransformTree(goldTree);
        foreach (Tree localTree in annotated)
        {
            // Only local trees with two or more children correspond to grammar
            // rules we can score; skip leaves, preterminals and unary nodes.
            if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
            {
                continue;
            }
            System.Console.Out.WriteLine(LocalTreeToRule(localTree));
            double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
            // NegativeInfinity marks a rule unknown to the grammar; the score is
            // printed either way (an empty placeholder branch was removed here).
            System.Console.Out.WriteLine("score: " + score);
        }
    }
}
// Use Stanford.NLP.Net to parse the sentence and return its parse tree.
// Also prints the tree in Penn Treebank format as a side effect.
static Tree Parse(string sent)
{
    // Loading English PCFG parser from file.
    // NOTE(review): the model is reloaded on every call — expensive; consider caching.
    var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var sentReader = new java.io.StringReader(sent);
    var rawWords = tokenizerFactory.getTokenizer(sentReader).tokenize();
    sentReader.close();
    var tree = lp.apply(rawWords);
    // Removed dead code: a grammatical structure and typed dependencies were
    // computed here but never used or printed.
    var tp = new TreePrint("penn");
    tp.printTree(tree);
    return (tree);
}
// todo: perhaps the output streams could be passed in
/// <summary>
/// Parse the files named in <paramref name="args"/> from index
/// <paramref name="argIndex"/> on.
/// </summary>
/// <remarks>
/// Convenience method: builds a ParseFiles object and delegates to it.
/// </remarks>
public static void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter, Options op, TreePrint treePrint, LexicalizedParser pqFactory)
    where _T0 : IHasWord
{
    var fileParser = new Edu.Stanford.Nlp.Parser.Lexparser.ParseFiles(op, treePrint, pqFactory);
    fileParser.ParseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter);
}
/// <summary>
/// Records the basic categories of every unary child and every binary
/// (left, right) child pair found in the parser's grammars.
/// </summary>
public FilterConfusingRules(LexicalizedParser parser)
{
    Options options = parser.GetOp();
    IIndex<string> states = parser.stateIndex;
    // Only one matrix per parent state; reduce states to basic categories.
    foreach (UnaryRule rule in parser.ug)
    {
        string childBasic = options.Langpack().BasicCategory(states.Get(rule.child));
        unaryRules.Add(childBasic);
    }
    foreach (BinaryRule rule in parser.bg)
    {
        string leftBasic = options.Langpack().BasicCategory(states.Get(rule.leftChild));
        string rightBasic = options.Langpack().BasicCategory(states.Get(rule.rightChild));
        binaryRules.Add(leftBasic, rightBasic);
    }
}
/// <summary>
/// Parses a sentence with the English PCFG model and prints the Penn tree
/// plus collapsed typed dependencies.
/// </summary>
public static void ParseString(string sentence)
{
    // Path to models extracted from `stanford-parser-3.6.0-models.jar`
    var modelsDirectory = @"../../../data/paket-files/stanford-corenlp-3.9.1-models/edu/stanford/nlp/models";
    var model = @"/lexparser/englishPCFG.ser.gz";
    // Loading english PCFG parser from file
    var lp = LexicalizedParser.loadModel(modelsDirectory + model);
    // Tokenize the raw sentence with an explicit PTB tokenizer.
    var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var reader = new StringReader(sentence);
    var tokens = factory.getTokenizer(reader).tokenize();
    // NOTE(review): the reader is intentionally left unclosed, matching the
    // original behavior — confirm whether this StringReader needs disposal.
    var tree = lp.apply(tokens);
    // Print the parse tree together with its collapsed typed dependencies.
    var printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.printTree(tree);
}
/// <summary>
/// Shows parsing pre-tokenized words and raw text via an explicit
/// tokenizer, then prints typed dependencies and a collapsed tree.
/// </summary>
public static void DemoAPI(LexicalizedParser lp)
{
    // Case 1: a sentence supplied as correctly tokenized words.
    string[] tokens = { "This", "is", "an", "easy", "sentence", "." };
    var labels = Sentence.toCoreLabelList(tokens);
    var parse = lp.apply(labels);
    parse.pennPrint();

    // Case 2: raw text pushed through an explicit PTB tokenizer.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var reader = new StringReader("This is another sentence.");
    var tokenized = tokenizerFactory.getTokenizer(reader).tokenize();
    parse = lp.apply(tokenized);

    // Typed dependencies (CC-processed) from the parse.
    var tlp = new PennTreebankLanguagePack();
    var gsf = tlp.grammaticalStructureFactory();
    var gs = gsf.newGrammaticalStructure(parse);
    var tdl = gs.typedDependenciesCCprocessed();
    Console.WriteLine("\n{0}\n", tdl);

    var treePrinter = new TreePrint("penn,typedDependenciesCollapsed");
    treePrinter.printTree(parse);
}
/// <summary>
/// Reparses the yield of <paramref name="tree"/> and returns the top
/// <paramref name="dvKBest"/> PCFG parses, optionally transformed.
/// Returns null when the sentence is empty or the reparse fails.
/// </summary>
public static IList<Tree> GetTopParsesForOneTree(LexicalizedParser parser, int dvKBest, Tree tree, ITreeTransformer transformer)
{
    IParserQuery query = parser.ParserQuery();
    // Since the trees are binarized and otherwise manipulated, the final
    // word is an end-of-sentence symbol and must be chopped off.
    IList<Word> words = tree.YieldWords();
    if (words.Count <= 1)
    {
        return (null);
    }
    words = words.SubList(0, words.Count - 1);
    if (!query.Parse(words))
    {
        log.Info("Failed to use the given parser to reparse sentence \"" + words + "\"");
        return (null);
    }
    IList<Tree> results = new List<Tree>();
    foreach (ScoredObject<Tree> scored in query.GetKBestPCFGParses(dvKBest))
    {
        Tree candidate = scored.Object();
        if (transformer != null)
        {
            candidate = transformer.TransformTree(candidate);
        }
        results.Add(candidate);
    }
    return (results);
}
/// <summary>
/// demoAPI demonstrates other ways of calling the parser with already
/// tokenized text, or raw text tokenized as a single sentence.
/// </summary>
/// <remarks>
/// Output is handled with a TreePrint object whose creation options
/// determine what gets printed; a PrintWriter can be passed to
/// TreePrint.printTree to capture output. This code is for English.
/// </remarks>
public static void DemoAPI(LexicalizedParser lp)
{
    // First: a sentence supplied as pre-tokenized words.
    string[] words = new string[] { "This", "is", "an", "easy", "sentence", "." };
    IList<CoreLabel> labels = SentenceUtils.ToCoreLabelList(words);
    Tree parse = lp.Apply(labels);
    parse.PennPrint();
    System.Console.Out.WriteLine();
    // Second: raw text pushed through an explicit PTB tokenizer.
    string rawSentence = "This is another sentence.";
    ITokenizerFactory<CoreLabel> factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
    IList<CoreLabel> tokenized = factory.GetTokenizer(new StringReader(rawSentence)).Tokenize();
    parse = lp.Apply(tokenized);
    // Typed dependencies via the parser's own language pack
    // (PennTreebankLanguagePack for English).
    ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
    GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
    IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
    System.Console.Out.WriteLine(tdl);
    System.Console.Out.WriteLine();
    // A TreePrint can also emit trees and dependencies in one call.
    TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.PrintTree(parse);
}
/// <summary>
/// Extracts the bundled English PCFG model to the temp directory on first
/// use, then loads the parser from it.
/// </summary>
public StanfordParsingService()
{
    // Use Path.Combine rather than string concatenation for the model path.
    var modelPath = Path.Combine(Path.GetTempPath(), "englishPCFG.ser.gz");
    if (!System.IO.File.Exists(modelPath))
    {
        System.IO.File.WriteAllBytes(modelPath, Smartifyer.Resources.englishPCFG_ser);
    }
    lp = LexicalizedParser.loadModel(modelPath);
}
/// <summary>
/// Parses the input text with the English PCFG model and prints its typed
/// dependencies, Penn tree and collapsed dependencies.
/// </summary>
public void ProcessText(string inputText)
{
    // Model location inside the locally extracted stanford-parser models jar.
    var jarRoot = "C:\\stanford-parser-full-2016-10-31\\stanford-parser-3.7.0-models";
    var modelsDirectory = jarRoot + "\\edu\\stanford\\nlp\\models";
    var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");
    // Tokenize the input with an explicit PTB tokenizer.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var reader = new StringReader(inputText);
    var tokens = tokenizerFactory.getTokenizer(reader).tokenize();
    reader.close();
    var tree = lp.apply(tokens);
    // Extract CC-processed typed dependencies from the lexical tree.
    var languagePack = new PennTreebankLanguagePack();
    var structureFactory = languagePack.grammaticalStructureFactory();
    var structure = structureFactory.newGrammaticalStructure(tree);
    var dependencies = structure.typedDependenciesCCprocessed();
    Console.WriteLine("\n{0}\n", dependencies);
    // Penn tree plus collapsed dependencies.
    var printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.printTree(tree);
}
/// <summary>
/// Loads a parser from -input (passing any extra flags through) and saves
/// it, serialized, to -output.
/// </summary>
public static void Main(string[] args)
{
    string input = null;
    string output = null;
    IList<string> extraArgs = Generics.NewArrayList();
    for (int argIndex = 0; argIndex < args.Length;)
    {
        // Flattened the original nested else { if ... } chains into else-if.
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
        {
            input = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
        {
            output = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            // Anything unrecognized is forwarded to the parser loader.
            extraArgs.Add(args[argIndex++]);
        }
    }
    // Fail fast with a clear message instead of a null-path failure later.
    if (input == null || output == null)
    {
        throw new ArgumentException("Both -input and -output must be specified");
    }
    LexicalizedParser parser = LexicalizedParser.LoadModel(input, extraArgs);
    parser.SaveParserToSerialized(output);
}
/// <summary>
/// Builds a reranker query: a training-style tree transformer, a
/// cost/gradient scorer over the enclosing DV model, and an empty
/// list for the deep trees produced during scoring.
/// </summary>
public Query(DVModelReranker _enclosing)
{
    this._enclosing = _enclosing;
    this.deepTrees = Generics.NewArrayList();
    this.transformer = LexicalizedParser.BuildTrainTransformer(_enclosing.op);
    this.scorer = new DVParserCostAndGradient(null, null, _enclosing.model, _enclosing.op);
}
/// <summary>
/// Value-capture constructor; stores the collaborators used when caching
/// parse hypotheses. No validation is performed by design.
/// </summary>
public CacheProcessor(CacheParseHypotheses cacher, LexicalizedParser parser, int dvKBest, ITreeTransformer transformer)
{
    this.transformer = transformer;
    this.dvKBest = dvKBest;
    this.parser = parser;
    this.cacher = cacher;
}
/// <summary>
/// Parses the document searching for sentences where the entity is found.
/// Returns CSV rows with the file, the entity, the sentence, and the syntax
/// analysis of the sentence.
/// </summary>
/// <param name="text">Document text.</param>
/// <param name="entity">Entity.</param>
/// <param name="origFile">Original file.</param>
/// <param name="language">Language key used to pick the parser model.</param>
public static List<string[]> Parse(string text, string entity, string origFile, string language)
{
    var results = new List<string[]>();
    // Load the parser model for the requested language.
    var modelsDirectory = StanfordEnv.PARSER_MODELS;
    var lexparserDirectory = modelsDirectory + StanfordEnv.GetParserLanguageFiles(language);
    var lp = LexicalizedParser.loadModel(lexparserDirectory);
    string[] splittedText = SplitText(text);
    List<string> entityLines = GetEntitiesLines(splittedText, entity);
    // PERF: the tokenizer factory is loop-invariant — create it once instead
    // of once per matching line.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    foreach (var line in entityLines)
    {
        // Parse the sentence containing the entity.
        var lineReader = new java.io.StringReader(line);
        var tokens = tokenizerFactory.getTokenizer(lineReader).tokenize();
        lineReader.close();
        var tree = lp.apply(tokens);
        results.Add(new string[] { origFile, entity, line, tree.ToString() });
    }
    return (results);
}
/// <summary>
/// demoDP demonstrates turning a file into tokens and then parse trees,
/// using DocumentPreprocessor for sentence segmentation and tokenization.
/// </summary>
/// <remarks>
/// Trees are printed by calling PennPrint on the Tree object (a PrintWriter
/// may be passed to capture the output). Works with any supported language.
/// </remarks>
public static void DemoDP(LexicalizedParser lp, string filename)
{
    ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
    // a PennTreebankLanguagePack for English
    // Dependencies are printed only when the language pack supports them.
    IGrammaticalStructureFactory gsf = tlp.SupportsGrammaticalStructures() ? tlp.GrammaticalStructureFactory() : null;
    // A tokenizer could also be created here and passed to DocumentPreprocessor.
    foreach (IList<IHasWord> sentence in new DocumentPreprocessor(filename))
    {
        Tree parse = lp.Apply(sentence);
        parse.PennPrint();
        System.Console.Out.WriteLine();
        if (gsf == null)
        {
            continue;
        }
        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        ICollection tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Trains an Arabic factored parser from the given training treebank and
/// then parses the sentences coming from the input stream; returns the
/// result of Parse(inputStream).
/// </summary>
/// <param name="trainTreebankFile">Treebank directory/file used for training.</param>
/// <param name="testTreebankFile">Unused here; kept for the caller's signature.</param>
/// <param name="inputStream">Stream of sentences to parse after training.</param>
public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
{
    // Configure Arabic factored parsing options before anything else.
    op = new Options();
    op.tlpParams = new ArabicTreebankParserParams();
    op.SetOptions("-arabicFactored");
    op.testOptions.maxLength = maxSentLen;
    op.testOptions.MaxItems = 5000000;
    //500000 is the default for Arabic, but we have substantially more edges now
    op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
    // WSG: Just set this to some high value so that extractBestParse()
    // actually calls the lattice reader (e.g., this says that we can't have a word longer than
    // 80 characters...seems sensible for Arabic
    op.testOptions.maxSpanForTags = 80;
    treePrint = op.testOptions.TreePrint(op.tlpParams);
    debinarizer = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
    subcategoryStripper = op.tlpParams.SubcategoryStripper();
    Timing.StartTime();
    // Load the training treebank from disk and train the grammar from it.
    Treebank trainTreebank = op.tlpParams.DiskTreebank();
    trainTreebank.LoadPath(trainTreebankFile);
    lp = GetParserDataFromTreebank(trainTreebank);
    MakeParsers();
    if (Verbose)
    {
        op.Display();
        // Summarize grammar sizes; rule counts only apply when a PCFG parser exists.
        string lexNumRules = (pparser != null) ? int.ToString(lp.lex.NumRules()) : string.Empty;
        log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
        log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules);
        log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
        log.Info("Lexicon is " + lp.lex.GetType().FullName);
    }
    return (Parse(inputStream));
}
/// <summary>
/// Returns a copy of the wrapped parser with the DV model installed as
/// its reranker. The original parser is left untouched.
/// </summary>
public virtual LexicalizedParser AttachModelToLexicalizedParser()
{
    LexicalizedParser clone = LexicalizedParser.CopyLexicalizedParser(parser);
    clone.reranker = new DVModelReranker(dvModel);
    return (clone);
}
/// <summary>
/// Serializes the parser (with the DV reranker attached) to the given file,
/// logging progress before and after.
/// </summary>
public virtual void SaveModel(string filename)
{
    log.Info("Saving serialized model to " + filename);
    AttachModelToLexicalizedParser().SaveParserToSerialized(filename);
    log.Info("... done");
}
/// <summary>
/// Loads the English PCFG model after pinning the thread culture.
/// </summary>
public NpletParser()
{
    // UGLY WORKAROUND for the Stanford parser: it misbehaves under
    // non-English locales, so force en-US on the current thread first.
    var enUs = new CultureInfo("en-US");
    Thread.CurrentThread.CurrentCulture = enUs;
    Thread.CurrentThread.CurrentUICulture = enUs;
    _parser = LexicalizedParser.loadModel("../../../StanfordModels/englishPCFG.ser.gz");
}
/// <summary>
/// Builds a combined reranker query: one training-style tree transformer
/// plus one cost/gradient scorer per DV model in the enclosing reranker.
/// </summary>
public Query(CombinedDVModelReranker _enclosing)
{
    this._enclosing = _enclosing;
    this.transformer = LexicalizedParser.BuildTrainTransformer(_enclosing.op);
    this.scorers = Generics.NewArrayList();
    foreach (DVModel dvModel in _enclosing.models)
    {
        scorers.Add(new DVParserCostAndGradient(null, null, dvModel, _enclosing.op));
    }
}
/// <summary>
/// Extracts the DVModel from a parser whose reranker is a DVModelReranker.
/// </summary>
/// <exception cref="ArgumentException">The parser has no DVModel reranker.</exception>
public static DVModel GetModelFromLexicalizedParser(LexicalizedParser parser)
{
    // Pattern matching tests and casts in one step (replaces is-check + cast).
    if (parser.reranker is DVModelReranker reranker)
    {
        return (reranker.GetModel());
    }
    throw new ArgumentException("This parser does not contain a DVModel reranker");
}
/// <summary>
/// Checks a list of sentences for correct grammar. Returns a new set of the
/// sentences that parse as grammatical, with terminal punctuation appended.
/// </summary>
/// <param name="sentences">Strings whose grammar will be checked.</param>
/// <returns>The grammatical sentences, or null when the input set is empty.</returns>
public static HashSet<string> CheckGrammer(HashSet<string> sentences)
{
    HashSet<string> validSentences = new HashSet<string>();
    Console.WriteLine(sentences.Count + " potential sentences\n");
    if (sentences.Count == 0)
    {
        return (null);
    }
    Console.WriteLine("Loading Stanford NLP...");
    // Loading english PCFG parser from file
    var lp = LexicalizedParser.loadModel("..\\..\\..\\packages\\englishPCFG.ser.gz");
    Console.WriteLine("Stanford Parser Loaded!\n");
    // Test the grammar of each candidate sentence.
    foreach (var candidate in sentences)
    {
        // Parse the candidate into the Stanford NLP tree representation.
        var punctuated = candidate;
        var reader = new StringReader(punctuated);
        var tree = lp.apply(PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(reader).tokenize());
        reader.close();
        // A declarative top label gets a period; question-like labels get "?".
        string treeText = tree.ToString();
        bool looksLikeSentence = false;
        if (treeText.Contains("(S "))
        {
            punctuated = candidate + ".";
            looksLikeSentence = true;
        }
        else if (treeText.Contains("(SINV ") || treeText.Contains("(SBARQ ") || treeText.Contains("(SQ "))
        {
            punctuated = candidate + "?";
            looksLikeSentence = true;
        }
        if (looksLikeSentence)
        {
            // Require a nominal subject relation before accepting the sentence.
            var tlp = new PennTreebankLanguagePack();
            string dependencies = tlp.grammaticalStructureFactory().newGrammaticalStructure(tree).typedDependenciesCCprocessed().ToString();
            if (dependencies.Contains("nsubj("))
            {
                validSentences.Add(punctuated);
            }
        }
    }
    return (validSentences);
}
/// <summary>
/// Loads a parser from -model and prints summaries of its tag and state
/// indices (reduced to basic categories) and its grammar sizes.
/// </summary>
public static void Main(string[] args)
{
    // Only -model is accepted; any other flag is a fatal error.
    string parserFile = null;
    for (int i = 0; i < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-model"))
        {
            parserFile = args[i + 1];
            i += 2;
        }
        else
        {
            string error = "Unknown argument " + args[i];
            log.Info(error);
            throw new Exception(error);
        }
    }
    if (parserFile == null)
    {
        log.Info("Must specify a model file with -model");
        System.Environment.Exit(2);
    }
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserFile));
    // Reduce the tag index to its basic categories and report both counts.
    ICollection<string> tags = Generics.NewTreeSet();
    foreach (string tag in parser.tagIndex)
    {
        tags.Add(parser.TreebankLanguagePack().BasicCategory(tag));
    }
    System.Console.Out.WriteLine("Basic tags: " + tags.Count);
    foreach (string basicTag in tags)
    {
        System.Console.Out.Write(" " + basicTag);
    }
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("All tags size: " + parser.tagIndex.Size());
    // Same reduction for the state index.
    ICollection<string> states = Generics.NewTreeSet();
    foreach (string state in parser.stateIndex)
    {
        states.Add(parser.TreebankLanguagePack().BasicCategory(state));
    }
    System.Console.Out.WriteLine("Basic states: " + states.Count);
    foreach (string basicState in states)
    {
        System.Console.Out.Write(" " + basicState);
    }
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("All states size: " + parser.stateIndex.Size());
    System.Console.Out.WriteLine("Unary grammar size: " + parser.ug.NumRules());
    System.Console.Out.WriteLine("Binary grammar size: " + parser.bg.NumRules());
}
/// <summary>
/// Loads the parser model and runs the file demo when a filename is given,
/// otherwise runs the API demo.
/// </summary>
public static void Start(string fileName)
{
    // The explicit model path argument is currently disabled.
    LexicalizedParser lp = LexicalizedParser.loadModel(/*Program.ParserModel*/);
    if (String.IsNullOrEmpty(fileName))
    {
        DemoAPI(lp);
    }
    else
    {
        DemoDP(lp, fileName);
    }
}
/// <summary>
/// Loads the English PCFG model and prepares the shared tokenizer factory,
/// language pack and grammatical structure factory reused across requests.
/// </summary>
/// <exception cref="InvalidOperationException">The model could not be loaded.</exception>
public NlpService()
{
    const string parserFileOrUrl = "englishPCFG.ser.gz";
    _lp = LexicalizedParser.loadModel(parserFileOrUrl);
    if (_lp == null)
    {
        throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
    }
    _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    _tlp = new PennTreebankLanguagePack();
    _structureFactory = _tlp.grammaticalStructureFactory();
}
/// <summary>
/// Loads a parser from -input, attaches a TaggerReranker built from the
/// -tagger model, and serializes the result to -output.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    string taggerFile = null;
    string inputFile = null;
    string outputFile = null;
    // NOTE: -weight is parsed but not currently consumed below.
    double weight = 1.0;
    for (int argIndex = 0; argIndex < args.Length;)
    {
        // Flattened the original deeply nested else { if ... } chain.
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tagger"))
        {
            taggerFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
        {
            inputFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
        {
            outputFile = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-weight"))
        {
            // BUG FIX: double.ValueOf is a Java-ism that does not exist in C#;
            // parse with the invariant culture for locale-independent results.
            weight = double.Parse(args[argIndex + 1], System.Globalization.CultureInfo.InvariantCulture);
            argIndex += 2;
        }
        else
        {
            throw new ArgumentException("Unknown argument: " + args[argIndex]);
        }
    }
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(inputFile));
    MaxentTagger tagger = new MaxentTagger(taggerFile);
    parser.reranker = new TaggerReranker(tagger, parser.GetOp());
    parser.SaveParserToSerialized(outputFile);
}
/// <summary>
/// Loads a serialized parser and runs TestOnTreebank over the file range of
/// the given treebank.
/// args: [0]=serialized parser, [1]=treebank path, [2]=low file number,
/// [3]=high file number, remaining=extra options.
/// </summary>
public virtual void RunTest(string[] args)
{
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));
    // in case a serialized options was read in
    op = parser.GetOp();
    Treebank testTreebank = op.tlpParams.MemoryTreebank();
    int rangeLow = System.Convert.ToInt32(args[2]);
    int rangeHigh = System.Convert.ToInt32(args[3]);
    testTreebank.LoadPath(args[1], new NumberRangeFileFilter(rangeLow, rangeHigh, true));
    op.SetOptionsOrWarn(args, 4, args.Length);
    TestOnTreebank(parser, new EnglishTreebankParserParams(), testTreebank, args[1], parser.stateIndex);
}
/// <summary>
/// Verifies that a (possibly GZIP-compressed) serialized parser model can
/// be loaded through an ObjectInputStream.
/// </summary>
public void LexicalizedParserTest()
{
    var model = Files.Parser.Models("lexparser/englishPCFG.ser.gz");
    using var fileStream = new FileStream(model, FileMode.Open);
    using var wrappedStream = new ikvm.io.InputStreamWrapper(fileStream);
    // Wrap in a GZIP decompressor only when the file extension says so.
    ObjectInputStream objectStream;
    if (model.EndsWith(".gz"))
    {
        objectStream = new ObjectInputStream(new GZIPInputStream(wrappedStream));
    }
    else
    {
        objectStream = new ObjectInputStream(wrappedStream);
    }
    using (objectStream)
    {
        var lp = LexicalizedParser.loadModel(objectStream);
        Assert.NotNull(lp);
    }
}
/// <summary>
/// Segments and tokenizes a file via DocumentPreprocessor, then parses each
/// sentence and prints its tree and CC-processed typed dependencies.
/// </summary>
public static void DemoDP(LexicalizedParser lp, string fileName)
{
    var tlp = new PennTreebankLanguagePack();
    var gsf = tlp.grammaticalStructureFactory();
    // A tokenizer could also be created here and handed to DocumentPreprocessor.
    foreach (List sentence in new DocumentPreprocessor(fileName))
    {
        var parse = lp.apply(sentence);
        parse.pennPrint();
        // Typed dependencies, CC-processed, including extras (true).
        var dependencies = gsf.newGrammaticalStructure(parse).typedDependenciesCCprocessed(true);
        System.Console.WriteLine("\n{0}\n", dependencies);
    }
}
/// <summary>
/// Loads, sentence-segments and tokenizes a file with DocumentPreprocessor,
/// printing each sentence's parse tree and typed dependencies.
/// </summary>
public static void DemoDP(LexicalizedParser lp, string fileName)
{
    var languagePack = new PennTreebankLanguagePack();
    var structureFactory = languagePack.grammaticalStructureFactory();
    foreach (List sentence in new DocumentPreprocessor(fileName))
    {
        var parse = lp.apply(sentence);
        parse.pennPrint();
        // CC-processed typed dependencies, including extras (true).
        var structure = structureFactory.newGrammaticalStructure(parse);
        var dependencies = structure.typedDependenciesCCprocessed(true);
        Console.WriteLine("\n{0}\n", dependencies);
    }
}
/// <summary>
/// Maps each input tree to its k-best reparses (k = op.trainOptions.dvKBest),
/// optionally logging progress every 10 trees.
/// </summary>
internal static IdentityHashMap<Tree, IList<Tree>> GetTopParses(LexicalizedParser parser, Options op, ICollection<Tree> trees, ITreeTransformer transformer, bool outputUpdates)
{
    var topParses = new IdentityHashMap<Tree, IList<Tree>>();
    foreach (Tree tree in trees)
    {
        topParses[tree] = GetTopParsesForOneTree(parser, op.trainOptions.dvKBest, tree, transformer);
        if (outputUpdates && topParses.Count % 10 == 0)
        {
            log.Info("Processed " + topParses.Count + " trees");
        }
    }
    if (outputUpdates)
    {
        log.Info("Finished processing " + topParses.Count + " trees");
    }
    return (topParses);
}
/// <summary>
/// Parses the sentence with the English PCFG model and returns the labels
/// of all leaves whose parent node carries a noun POS label.
/// </summary>
public static List<string> ExtractNounsFromSemantics(string sentence)
{
    // Locate the bundled CoreNLP models relative to the executing assembly.
    string assemblyPath = Assembly.GetExecutingAssembly().GetName().CodeBase;
    string projectPath = Directory.GetParent(new Uri(Path.GetDirectoryName(Path.GetDirectoryName(Path.GetDirectoryName(assemblyPath)))).LocalPath).FullName;
    string modelsDirectory = Path.GetFullPath(projectPath + @"\Parser\CoreNLP-3.9.1-Models\edu\stanford\nlp\models");
    // Loading english PCFG parser from file
    LexicalizedParser lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");
    // Tokenize and parse the sentence with an explicit PTB tokenizer.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var reader = new java.io.StringReader(sentence);
    var tokens = tokenizerFactory.getTokenizer(reader).tokenize();
    reader.close();
    var tree = lp.apply(tokens);
    // Leaves whose parent (POS tag node) is a noun label are the nouns.
    var nodes = tree.toArray().Cast<LabeledScoredTreeNode>();
    return nodes
        .Where(n => n.isLeaf() && nounLabels.Contains(n.parent(tree).label().value()))
        .Select(n => n.label().ToString())
        .ToList();
}
/// <summary>
/// Wraps an existing LexicalizedParser for DV training: seeds the random
/// seed when unset, logs the full training configuration, and builds the
/// DVModel from the parser's state index and grammars.
/// </summary>
/// <param name="parser">Base parser supplying options, indices and grammars.</param>
public DVParser(LexicalizedParser parser)
{
    this.parser = parser;
    this.op = parser.GetOp();
    // A zero seed means "not set": take one from the nanosecond clock so runs differ.
    if (op.trainOptions.randomSeed == 0)
    {
        op.trainOptions.randomSeed = Runtime.NanoTime();
        log.Info("Random seed not set, using randomly chosen seed of " + op.trainOptions.randomSeed);
    }
    else
    {
        log.Info("Random seed set to " + op.trainOptions.randomSeed);
    }
    // Echo the complete training configuration for reproducibility.
    log.Info("Word vector file: " + op.lexOptions.wordVectorFile);
    log.Info("Size of word vectors: " + op.lexOptions.numHid);
    log.Info("Number of hypothesis trees to train against: " + op.trainOptions.dvKBest);
    log.Info("Number of trees in one batch: " + op.trainOptions.batchSize);
    log.Info("Number of iterations of trees: " + op.trainOptions.trainingIterations);
    log.Info("Number of qn iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
    log.Info("Learning rate: " + op.trainOptions.learningRate);
    log.Info("Delta margin: " + op.trainOptions.deltaMargin);
    log.Info("regCost: " + op.trainOptions.regCost);
    log.Info("Using unknown word vector for numbers: " + op.trainOptions.unknownNumberVector);
    log.Info("Using unknown dashed word vector heuristics: " + op.trainOptions.unknownDashedWordVectors);
    log.Info("Using unknown word vector for capitalized words: " + op.trainOptions.unknownCapsVector);
    log.Info("Using unknown number vector for Chinese words: " + op.trainOptions.unknownChineseNumberVector);
    log.Info("Using unknown year vector for Chinese words: " + op.trainOptions.unknownChineseYearVector);
    log.Info("Using unknown percent vector for Chinese words: " + op.trainOptions.unknownChinesePercentVector);
    log.Info("Initial matrices scaled by: " + op.trainOptions.scalingForInit);
    log.Info("Training will use " + op.trainOptions.trainingThreads + " thread(s)");
    log.Info("Context words are " + ((op.trainOptions.useContextWords) ? "on" : "off"));
    log.Info("Model will " + ((op.trainOptions.dvSimplifiedModel) ? string.Empty : "not ") + "be simplified");
    this.dvModel = new DVModel(op, parser.stateIndex, parser.ug, parser.bg);
    // The transform and score maps must stay parallel; a size mismatch
    // indicates a corrupt or inconsistent model.
    if (dvModel.unaryTransform.Count != dvModel.unaryScore.Count)
    {
        throw new AssertionError("Unary transform and score size not the same");
    }
    if (dvModel.binaryTransform.Size() != dvModel.binaryScore.Size())
    {
        throw new AssertionError("Binary transform and score size not the same");
    }
}
/// <summary>
/// Loads the serialized PCFG parser model from the given file, tracking the
/// result in the loaded flag.
/// </summary>
/// <param name="modelFile">Path to the serialized model (e.g. englishPCFG.ser.gz).</param>
/// <returns>True when the model was loaded successfully; false otherwise.</returns>
public bool LoadModel(string modelFile)
{
    try
    {
        _sdpModel = LexicalizedParser.loadModel(modelFile);
        _isSDPModelLoaded = true;
        return true;
    }
    catch (Exception ex)
    {
        // Best-effort load: report why it failed (was a silent bare catch
        // with a typo, "Uable") and leave the service unloaded.
        System.Console.WriteLine("Unable to load the Model englishPCFG.ser.gz... " + ex.Message);
        _isSDPModelLoaded = false;
        return false;
    }
}
/// <summary>
/// Parses pre-tokenized words and then raw text via an explicit tokenizer,
/// printing the trees, typed dependencies and collapsed dependencies.
/// </summary>
public static void DemoAPI(LexicalizedParser lp)
{
    // Case 1: already-tokenized words.
    string[] tokens = { "This", "is", "an", "easy", "sentence", "." };
    var parse = lp.apply(Sentence.toCoreLabelList(tokens));
    parse.pennPrint();
    // Case 2: raw text through an explicit PTB tokenizer.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var reader = new StringReader("This is another sentence.");
    parse = lp.apply(tokenizerFactory.getTokenizer(reader).tokenize());
    // CC-processed typed dependencies from the parse.
    var tlp = new PennTreebankLanguagePack();
    var structure = tlp.grammaticalStructureFactory().newGrammaticalStructure(parse);
    var dependencies = structure.typedDependenciesCCprocessed();
    System.Console.WriteLine("\n{0}\n", dependencies);
    // Penn tree plus collapsed dependencies in one printer call.
    var printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.printTree(parse);
}