/// <summary>
/// Saves the results of applying the parser to the current text to
/// the specified filename.
/// </summary>
/// <param name="filename">Destination file; a null or empty name is silently ignored.</param>
public virtual void SaveOutput(string filename)
{
    // Idiom fix: string.IsNullOrEmpty covers both the null and "" checks.
    if (string.IsNullOrEmpty(filename))
    {
        return;
    }
    // Sentence-segment the current editor text using the language pack's tokenizer.
    string text = textPane.GetText();
    StringReader reader = new StringReader(text);
    DocumentPreprocessor processor = new DocumentPreprocessor(reader);
    ITokenizerFactory<IHasWord> tf = tlp.GetTokenizerFactory();
    processor.SetTokenizerFactory(tf);
    IList<IList<IHasWord>> sentences = new List<IList<IHasWord>>();
    foreach (IList<IHasWord> sentence in processor)
    {
        sentences.Add(sentence);
    }
    // Modal progress dialog so the user can watch the parse advance.
    JProgressBar progress = new JProgressBar(0, sentences.Count);
    JButton cancel = new JButton();
    JDialog dialog = new JDialog(new Frame(), "Parser Progress", true);
    dialog.SetSize(300, 150);
    dialog.Add(BorderLayout.North, new JLabel("Parsing " + sentences.Count + " sentences"));
    dialog.Add(BorderLayout.Center, progress);
    dialog.Add(BorderLayout.South, cancel);
    ParserPanel.SaveOutputThread thread = new ParserPanel.SaveOutputThread(this, filename, progress, dialog, cancel, sentences);
    cancel.SetText("Cancel");
    cancel.SetToolTipText("Cancel");
    // NOTE(review): registering a null listener means the Cancel button does
    // nothing when clicked; presumably a handler that stops `thread` was
    // intended here — confirm against the original Java source.
    cancel.AddActionListener(null);
    thread.Start();
    // A modal dialog blocks here until the worker thread closes it.
    dialog.SetVisible(true);
}
/// <summary>
/// Demo: POS-tags every sentence of a UTF-8 input file, then prints the
/// adjectives (tags starting with "JJ") of one hard-coded sentence to show
/// how to access words and tags in a tagged sentence.
/// </summary>
/// <param name="args">args[0] = tagger model file, args[1] = text file to tag.</param>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        log.Info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // "untokenizable=noneKeep": keep characters the tokenizer cannot handle
    // instead of dropping them.
    ITokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
    foreach (IList<IHasWord> sentence in documentPreprocessor)
    {
        IList<TaggedWord> tSentence = tagger.TagSentence(sentence);
        pw.Println(SentenceUtils.ListToString(tSentence, false));
    }
    // Fix: close the input reader once the document has been fully consumed
    // (it was previously leaked; only the writer was closed).
    r.Close();
    // print the adjectives in one more sentence. This shows how to get at
    // words and tags in a tagged sentence.
    IList<IHasWord> sent = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    IList<TaggedWord> taggedSent = tagger.TagSentence(sent);
    foreach (TaggedWord tw in taggedSent)
    {
        if (tw.Tag().StartsWith("JJ"))
        {
            pw.Println(tw.Word());
        }
    }
    pw.Close();
}
// Tags every sentence of the named file with the default POS model and
// writes each tagged sentence to standard output.
public static void Execute(string fileName)
{
    var posTagger = new MaxentTagger(TaggerDemo.Model);
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    var input = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "utf-8"));
    var preprocessor = new DocumentPreprocessor(input);
    preprocessor.setTokenizerFactory(tokenizerFactory);
    // The preprocessor yields one tokenized sentence per iteration.
    foreach (List tokenizedSentence in preprocessor)
    {
        var tagged = posTagger.tagSentence(tokenizedSentence);
        System.Console.WriteLine(Sentence.listToString(tagged, false));
    }
}
// Splits the input text into sentences, re-joining each sentence's tokens
// with their original whitespace, and returns one string per sentence.
public static List<string> Go(string input)
{
    java.io.Reader source = new java.io.StringReader(input);
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(source);
    preprocessor.setTokenizerFactory(TokenizerFactory);
    var sentences = new List<string>();
    foreach (java.util.List tokenized in preprocessor)
    {
        sentences.Add(StringUtils.joinWithOriginalWhiteSpace(tokenized));
    }
    return sentences;
}
/// <summary>
/// Demo: POS-tags and dependency-parses a fixed example sentence, logging
/// the grammatical structure of each parsed sentence.
/// Usage: [-tagger taggerPath] [-model modelPath]
/// </summary>
// static main method only
public static void Main(string[] args)
{
    string modelPath = DependencyParser.DefaultModel;
    string taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    // Each recognized flag consumes two slots: the flag itself and its value.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        switch (args[argIndex])
        {
            case "-tagger":
            {
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            case "-model":
            {
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            default:
            {
                // Fix: throw the specific ArgumentException (still a subclass of
                // Exception) rather than the generic base type.
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
        }
    }
    string text = "I can almost always tell when movies use fake dinosaurs.";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.LoadFromModelFile(modelPath);
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    foreach (IList<IHasWord> sentence in tokenizer)
    {
        IList<TaggedWord> tagged = tagger.TagSentence(sentence);
        GrammaticalStructure gs = parser.Predict(tagged);
        // Print typed dependencies
        log.Info(gs);
    }
}
/// <summary>
/// Demo: POS-tags and shift-reduce-parses a fixed example sentence, logging
/// the constituency tree of each parsed sentence.
/// Usage: [-tagger taggerPath] [-model modelPath]
/// </summary>
public static void Main(string[] args)
{
    string modelPath = "edu/stanford/nlp/models/srparser/englishSR.ser.gz";
    string taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    // Each recognized flag consumes two slots: the flag itself and its value.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        switch (args[argIndex])
        {
            case "-tagger":
            {
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            case "-model":
            {
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            default:
            {
                // Fix: throw the specific ArgumentException (still a subclass of
                // Exception) rather than the generic base type.
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
        }
    }
    string text = "My dog likes to shake his stuffed chickadee toy.";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    ShiftReduceParser model = ((ShiftReduceParser)ShiftReduceParser.LoadModel(modelPath));
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    foreach (IList<IHasWord> sentence in tokenizer)
    {
        IList<TaggedWord> tagged = tagger.TagSentence(sentence);
        Tree tree = model.Apply(tagged);
        log.Info(tree);
    }
}
// Demonstrates loading, sentence-segmenting, and tokenizing a file with
// DocumentPreprocessor, then parsing each sentence and printing its
// CC-processed typed dependencies.
public void LoadSentencesFromFile()
{
    var languagePack = new PennTreebankLanguagePack();
    var structureFactory = languagePack.grammaticalStructureFactory();
    // A custom tokenizer could also be created here and handed to the
    // preprocessor instead of relying on its default.
    var preprocessor = new DocumentPreprocessor(Files.DataFile("SampleText.txt"));
    foreach (var tokenizedSentence in preprocessor.ToSeq().Cast<List>())
    {
        var tree = _lp.apply(tokenizedSentence);
        Assert.NotNull(tree);
        tree.pennPrint();
        var structure = structureFactory.newGrammaticalStructure(tree);
        var dependencies = structure.typedDependenciesCCprocessed(true);
        TestContext.Out.WriteLine($"\n{dependencies}\n");
    }
}
/// <summary>
/// Builds a coref Document for the given input: collects one mention list
/// per sentence (from gold annotations when enabled, otherwise from the
/// precomputed coref mention annotations), then preprocesses the document.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual Document MakeDocument(InputDoc input)
{
    IList<IList<Mention>> mentionsPerSentence = new List<IList<Mention>>();
    if (CorefProperties.UseGoldMentions(props))
    {
        // Gold path: rebuild each gold mention over the sentence's tokens and
        // let the mention detector assign heads.
        IList<ICoreMap> sentences = input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        for (int sentIdx = 0; sentIdx < sentences.Count; sentIdx++)
        {
            ICoreMap sentence = sentences[sentIdx];
            IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<Mention> current = new List<Mention>();
            mentionsPerSentence.Add(current);
            foreach (Mention gold in input.goldMentions[sentIdx])
            {
                current.Add(new Mention(-1, gold.startIndex, gold.endIndex, words, null, null, new List<CoreLabel>(words.SubList(gold.startIndex, gold.endIndex))));
            }
            md.FindHead(sentence, current);
        }
    }
    else
    {
        // System path: mentions were already attached to each sentence.
        foreach (ICoreMap sentence in input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            mentionsPerSentence.Add(sentence.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)));
        }
    }
    Document doc = new Document(input, mentionsPerSentence);
    if (input.goldMentions != null)
    {
        FindGoldMentionHeads(doc);
    }
    DocumentPreprocessor.Preprocess(doc, dict, null, headFinder);
    return doc;
}
/// <summary>
/// Command-line tool: loads a LexicalizedParser with a DVModel reranker,
/// parses each sentence of an input file, and writes per-sentence files
/// ("sentenceN.txt") containing the sentence, its best deep tree, the word
/// vectors, and the tree's node matrices.
/// Flags: -model, -output (directory, must not already exist), -input,
/// -testTreebank; unrecognized flags are forwarded to the parser loader.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputPath = null;
    string inputPath = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = Generics.NewArrayList();
    // Scan flags; anything unrecognized is collected and passed through to
    // LexicalizedParser.LoadModel below.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
        {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
            {
                outputPath = args[argIndex + 1];
                argIndex += 2;
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                {
                    inputPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        // -testTreebank may carry sub-arguments (path + filter).
                        Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex++]);
                    }
                }
            }
        }
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
    DVModel model = DVParser.GetModelFromLexicalizedParser(parser);
    // Output directory must be fresh: fail if it exists, then create it.
    File outputFile = new File(outputPath);
    FileSystem.CheckNotExistsOrFail(outputFile);
    FileSystem.MkdirOrFail(outputFile);
    int count = 0;
    if (inputPath != null)
    {
        // NOTE(review): `input` is never closed after the loop — presumably a
        // leak inherited from the original; confirm before changing.
        Reader input = new BufferedReader(new FileReader(inputPath));
        DocumentPreprocessor processor = new DocumentPreprocessor(input);
        foreach (IList<IHasWord> sentence in processor)
        {
            count++;
            // index from 1
            IParserQuery pq = parser.ParserQuery();
            if (!(pq is RerankingParserQuery))
            {
                throw new ArgumentException("Expected a RerankingParserQuery");
            }
            RerankingParserQuery rpq = (RerankingParserQuery)pq;
            if (!rpq.Parse(sentence))
            {
                throw new Exception("Unparsable sentence: " + sentence);
            }
            IRerankerQuery reranker = rpq.RerankerQuery();
            if (!(reranker is DVModelReranker.Query))
            {
                throw new ArgumentException("Expected a DVModelReranker");
            }
            // Take the top deep tree and its node-to-vector map.
            DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
            IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.GetVectors();
            foreach (KeyValuePair<Tree, SimpleMatrix> entry in vectors)
            {
                log.Info(entry.Key + " " + entry.Value);
            }
            // One output file per sentence: text, tree, word vectors, node matrices.
            FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
            BufferedWriter bout = new BufferedWriter(fout);
            bout.Write(SentenceUtils.ListToString(sentence));
            bout.NewLine();
            bout.Write(deepTree.GetTree().ToString());
            bout.NewLine();
            foreach (IHasWord word in sentence)
            {
                OutputMatrix(bout, model.GetWordVector(word.Word()));
            }
            Tree rootTree = FindRootTree(vectors);
            OutputTreeMatrices(bout, rootTree, vectors);
            // NOTE(review): only the inner FileWriter is closed; bout is flushed
            // but never closed — confirm whether this matters in this port.
            bout.Flush();
            fout.Close();
        }
    }
}
/// <summary>
/// Reads a file of one text (optionally "id\ttext") per line, in which spans
/// are marked inline with XML-like tags drawn from categoriesAllowed
/// (e.g. &lt;LABEL&gt; ... &lt;/LABEL&gt;), and returns one ICoreMap per
/// sentence whose tokens carry the enclosing label. Tokens outside any tag
/// get the background symbol "O". Each sentence's doc ID is
/// sentIDprefix + lineId + "-" + sentenceIndex.
/// </summary>
/// <param name="reader">source of lines to parse</param>
/// <param name="categoriesAllowed">label names recognized as inline tags</param>
/// <param name="setClassForTheseLabels">optional map from label to the annotation class to set for tokens with that label</param>
/// <param name="setGoldClass">if true, also store the label as the token's gold answer</param>
/// <param name="sentIDprefix">prefix prepended to every sentence's doc ID</param>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Opening and closing tag patterns over the allowed category names.
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // A line is either "id<TAB>text" or bare text (id = line number).
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                text = t[0];
                id = lineNum.ToString();
            }
        }
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        // Tokenize without PTB escaping so the inline <LABEL> tags survive
        // as individual tokens and can be matched below.
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // `label` is a running state: set on an opening tag, reset to the
        // background symbol on a closing tag, applied to tokens in between.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    //System.out.println("matched starting");
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        //System.out.println("matched end");
                        label = backgroundSymbol;
                    }
                    else
                    {
                        // Ordinary token: build a CoreLabel carrying the current label.
                        CoreLabel c = new CoreLabel();
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // Package the sentence: text, tokens, and a unique doc ID.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return(sentences);
}
/// <summary>Finds the nearest delimiter starting from index start.</summary>
/// <remarks>
/// Finds the nearest delimiter starting from index start. If <tt>seekDir</tt>
/// is SEEK_FORWARD, finds the nearest delimiter after start. Else, if it is
/// SEEK_BACK, finds the nearest delimiter before start.
/// </remarks>
/// <returns>
/// The character offset of the delimiter (one before the relevant sentence
/// boundary), or -1 if start falls in no sentence interval.
/// </returns>
private int NearestDelimiter(string text, int start, int seekDir)
{
    if (seekDir != SeekBack && seekDir != SeekForward)
    {
        throw new ArgumentException("Unknown seek direction " + seekDir);
    }
    StringReader reader = new StringReader(text);
    // Sentence-segment the text; each sentence's starting character offset
    // becomes a boundary.
    DocumentPreprocessor processor = new DocumentPreprocessor(reader);
    ITokenizerFactory<IHasWord> tf = tlp.GetTokenizerFactory();
    processor.SetTokenizerFactory(tf);
    // Invariant: boundaries[i] = offset where sentence i begins; a final
    // entry of text.Length closes the last half-open interval.
    IList<int> boundaries = new List<int>();
    foreach (IList<IHasWord> sentence in processor)
    {
        if (sentence.Count == 0)
        {
            continue;
        }
        if (!(sentence[0] is IHasOffset))
        {
            throw new InvalidCastException("Expected HasOffsets from the " + "DocumentPreprocessor");
        }
        if (boundaries.Count == 0)
        {
            // The first sentence is pinned to the start of the text.
            boundaries.Add(0);
        }
        else
        {
            IHasOffset first = (IHasOffset)sentence[0];
            boundaries.Add(first.BeginPosition());
        }
    }
    boundaries.Add(text.Length);
    // Find the interval [boundaries[i], boundaries[i+1]) containing start
    // and return the delimiter just before its near or far edge.
    for (int i = 0; i < boundaries.Count - 1; ++i)
    {
        if (boundaries[i] <= start && start < boundaries[i + 1])
        {
            if (seekDir == SeekBack)
            {
                return(boundaries[i] - 1);
            }
            else
            {
                if (seekDir == SeekForward)
                {
                    return(boundaries[i + 1] - 1);
                }
            }
        }
    }
    // The cursor position at the end is actually one past the text length.
    // We might as well highlight the last interval in that case.
    if (boundaries.Count >= 2 && start >= text.Length)
    {
        if (seekDir == SeekBack)
        {
            return(boundaries[boundaries.Count - 2] - 1);
        }
        else
        {
            if (seekDir == SeekForward)
            {
                return(boundaries[boundaries.Count - 1] - 1);
            }
        }
    }
    // start was outside every interval (e.g. negative).
    return(-1);
}
/// <summary>This example shows a few more ways of providing input to a parser.</summary>
/// <remarks>
/// This example shows a few more ways of providing input to a parser.
/// Usage: ParserDemo2 [grammar [textFile]]
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string[] options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
    ITreebankLanguagePack tlp = lp.GetOp().Langpack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
    IEnumerable<IList<IHasWord>> sentences;
    if (args.Length > 1)
    {
        // A text file was supplied: sentence-split it with DocumentPreprocessor.
        DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        foreach (IList<IHasWord> sentence in dp)
        {
            tmp.Add(sentence);
        }
        sentences = tmp;
    }
    else
    {
        // Showing tokenization and parsing in code a couple of different ways.
        // Way 1: build a word list by hand.
        string[] sent = new string[] { "This", "is", "an", "easy", "sentence", "." };
        IList<IHasWord> sentence = new List<IHasWord>();
        foreach (string word in sent)
        {
            sentence.Add(new Word(word));
        }
        string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
        // Way 2: use the default tokenizer for this TreebankLanguagePack.
        ITokenizer<IHasWord> toke = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
        IList<IHasWord> sentence2 = toke.Tokenize();
        // Way 3: supply pre-tagged words.
        string[] sent3 = new string[] { "It", "can", "can", "it", "." };
        string[] tag3 = new string[] { "PRP", "MD", "VB", "PRP", "." };
        // Parser gets second "can" wrong without help
        IList<TaggedWord> sentence3 = new List<TaggedWord>();
        for (int i = 0; i < sent3.Length; i++)
        {
            sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
        }
        Tree parse = lp.Parse(sentence3);
        parse.PennPrint();
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        tmp.Add(sentence);
        tmp.Add(sentence2);
        tmp.Add(sentence3);
        sentences = tmp;
    }
    // Parse every collected sentence and print its tree, CC-processed typed
    // dependencies, per-word labels, and tagged yield.
    foreach (IList<IHasWord> sentence_1 in sentences)
    {
        Tree parse = lp.Parse(sentence_1);
        parse.PennPrint();
        System.Console.Out.WriteLine();
        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine("The words of the sentence:");
        foreach (ILabel lab in parse.Yield())
        {
            if (lab is CoreLabel)
            {
                System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
            }
            else
            {
                System.Console.Out.WriteLine(lab);
            }
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine(parse.TaggedYield());
        System.Console.Out.WriteLine();
    }
    // This method turns the String into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    string sent3_1 = "This is one last test!";
    lp.Parse(sent3_1).PennPrint();
}
/// <summary>
/// Parses each file named in args (from argIndex onward) — "-" means read
/// from stdin — writing parses through treePrint and accumulating
/// word/sentence counters. Parsing runs single-threaded or through a
/// MulticoreWrapper depending on op.testOptions.testingThreads, and
/// per-file output may be redirected to files when writeOutputFiles is set.
/// Finally prints summary statistics (words/sec, sents/sec, failures).
/// </summary>
public virtual void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter)
    where _T0 : IHasWord
{
    // An element delimiter signals XML input; otherwise treat input as plain text.
    DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    if (op.testOptions.verbose)
    {
        if (tokenizerFactory != null)
        {
            pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }
    Timing timer = new Timing();
    // timer.start(); // constructor already starts it.
    //Loop over the files
    for (int i = argIndex; i < args.Length; i++)
    {
        string filename = args[i];
        DocumentPreprocessor documentPreprocessor;
        if (filename.Equals("-"))
        {
            // "-" reads from standard input in the configured encoding.
            try
            {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
        else
        {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
        }
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
        documentPreprocessor.SetEscaper(escaper);
        documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.SetTagDelimiter(tagDelimiter);
        documentPreprocessor.SetElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
        {
            // Already-tokenized input gets no tokenizer at all.
            documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
        }
        else
        {
            documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
        }
        //Setup the output
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles)
        {
            string normalizedName = filename;
            try
            {
                new URL(normalizedName);
                // this will exception if not a URL
                normalizedName = normalizedName.ReplaceAll("/", "_");
            }
            catch (MalformedURLException)
            {
            }
            //It isn't a URL, so silently ignore
            string ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            string fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
            {
                string fseparator = Runtime.GetProperty("file.separator");
                if (fseparator == null || fseparator.IsEmpty())
                {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
            }
            try
            {
                pwo = op.tlpParams.Pw(new FileOutputStream(fname));
            }
            catch (IOException ioe)
            {
                throw new RuntimeIOException(ioe);
            }
        }
        treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
        pwErr.Println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        if (op.testOptions.testingThreads != 1)
        {
            // Multithreaded path: feed sentences into the wrapper, draining
            // completed parser queries as they become available.
            MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                wrapper.Put(sentence);
                while (wrapper.Peek())
                {
                    IParserQuery pq = wrapper.Poll();
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }
            wrapper.Join();
            // Drain results still pending after the last sentence was queued.
            while (wrapper.Peek())
            {
                IParserQuery pq = wrapper.Poll();
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        else
        {
            // Single-threaded path: one query object reused for every sentence.
            IParserQuery pq = pqFactory.ParserQuery();
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                pq.ParseAndReport(sentence, pwErr);
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        treePrint.PrintFooter(pwo);
        if (op.testOptions.writeOutputFiles)
        {
            pwo.Close();
        }
        pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.Stop();
    if (summary)
    {
        if (pcfgLL != null)
        {
            pcfgLL.Display(false, pwErr);
        }
        if (depLL != null)
        {
            depLL.Display(false, pwErr);
        }
        if (factLL != null)
        {
            factLL.Display(false, pwErr);
        }
    }
    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }
    // Throughput statistics over all files parsed in this call.
    double wordspersec = numWords / (((double)millis) / 1000);
    double sentspersec = numSents / (((double)millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00");
    // easier way!
    pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
    if (numFallback > 0)
    {
        pwErr.Println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
    {
        pwErr.Println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0)
        {
            pwErr.Println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0)
        {
            pwErr.Println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0)
        {
            pwErr.Println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}
/// <summary>
/// Turns a text file into trees for use in a RNTN classifier such as
/// the treebank used in the Sentiment project.
/// </summary>
/// <remarks>
/// Turns a text file into trees for use in a RNTN classifier such as
/// the treebank used in the Sentiment project.
/// <br />
/// The expected input file is one sentence per line, with sentences
/// separated by blank lines. The first line has the main label of the sentence together with the full sentence.
/// Lines after the first sentence line but before
/// the blank line will be treated as labeled sub-phrases. The
/// labels should start with the label and then contain a list of
/// tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
/// For example:
/// <br />
/// <code>
/// 1 Today is not a good day.<br />
/// 3 good<br />
/// 3 good day <br />
/// 3 a good day <br />
/// <br />
/// (next block starts here) <br />
/// </code>
/// By default the englishPCFG parser is used. This can be changed
/// with the
/// <c>-parserModel</c>
/// flag. Specify an input file
/// with
/// <c>-input</c>
/// .
/// <br />
/// If a sentiment model is provided with -sentimentModel, that model
/// will be used to prelabel the sentences. Any spans with given
/// labels will then be used to adjust those labels.
/// </remarks>
public static void Main(string[] args)
{
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    string parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string inputPath = null;
    string sentimentModelPath = null;
    SentimentModel sentimentModel = null;
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
        {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parserModel"))
        {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-sentimentModel"))
        {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            log.Info("Unknown argument " + args[argIndex]);
            System.Environment.Exit(2);
        }
    }
    if (inputPath == null)
    {
        throw new ArgumentException("Must specify input file with -input");
    }
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
    TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
    if (sentimentModelPath != null)
    {
        sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
    }
    string text = IOUtils.SlurpFileNoExceptions(inputPath);
    string[] chunks = text.Split("\\n\\s*\\n+");
    // need blank line to make a new chunk
    foreach (string chunk in chunks)
    {
        if (chunk.Trim().IsEmpty())
        {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        // Here we take the first line and tokenize it as one sentence.
        string[] lines = chunk.Trim().Split("\\n");
        string sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.SetSentenceFinalPuncWords(new string[] { "\n" });
        // Fix: IEnumerator<T>.Current is undefined before MoveNext() has been
        // called (the original read .Current immediately, a mistranslation of
        // Java's iterator().next()). Advance first, and skip chunks that
        // yield no sentence at all.
        IEnumerator<IList<IHasWord>> sentenceIterator = document.GetEnumerator();
        if (!sentenceIterator.MoveNext())
        {
            continue;
        }
        IList<IHasWord> tokens = sentenceIterator.Current;
        // The first token of the line is the sentence's main label.
        int mainLabel = System.Convert.ToInt32(tokens[0].Word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.SubList(1, tokens.Count);
        //log.info(tokens);
        IDictionary<Pair<int, int>, string> spanToLabels = Generics.NewHashMap();
        for (int i = 1; i < lines.Length; ++i)
        {
            ExtractLabels(spanToLabels, tokens, lines[i]);
        }
        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.Apply(tokens);
        Tree binarized = binarizer.TransformTree(tree);
        Tree collapsedUnary = transformer.TransformTree(binarized);
        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null)
        {
            Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.ForwardPropagateTree(collapsedUnary);
            SetPredictedLabels(collapsedUnary);
        }
        else
        {
            SetUnknownLabels(collapsedUnary, mainLabel);
        }
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
        collapsedUnary.IndexSpans();
        // Overwrite labels for any user-specified spans.
        foreach (KeyValuePair<Pair<int, int>, string> pairStringEntry in spanToLabels)
        {
            SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
        }
        System.Console.Out.WriteLine(collapsedUnary);
    }
}
// Constructs an Analyzer over the document at `path`, storing the supplied
// options and falling back to the default punctuation patterns when none
// are given, then calls Open() (see that method for what loading entails).
// NOTE(review): docType defaults to null, which assumes
// DocumentPreprocessor.DocType is a reference or nullable type in this
// port — confirm; a plain enum could not default to null.
public Analyzer(string path, DocumentPreprocessor.DocType docType = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
    : this()
{
    _path = path;
    _docType = docType;
    _ignore = ignore;
    _options = options;
    // null punctuation means "use the class default patterns".
    _punctuation = punctuation ?? PunctuationPatterns;
    Open();
}