/// <summary>
/// Saves the results of applying the parser to the current text to
/// the specified filename.
/// </summary>
/// <param name="filename">Destination file; a null or empty name is silently ignored.</param>
public virtual void SaveOutput(string filename)
{
    // Idiom fix: string.IsNullOrEmpty covers both the null and "" checks.
    if (string.IsNullOrEmpty(filename))
    {
        return;
    }
    // Sentence-segment the current editor text using the language pack's tokenizer.
    string text = textPane.GetText();
    StringReader reader = new StringReader(text);
    DocumentPreprocessor processor = new DocumentPreprocessor(reader);
    ITokenizerFactory<IHasWord> tf = tlp.GetTokenizerFactory();
    processor.SetTokenizerFactory(tf);
    IList<IList<IHasWord>> sentences = new List<IList<IHasWord>>();
    foreach (IList<IHasWord> sentence in processor)
    {
        sentences.Add(sentence);
    }
    // Modal progress dialog so the user can watch the parse advance.
    JProgressBar progress = new JProgressBar(0, sentences.Count);
    JButton cancel = new JButton();
    JDialog dialog = new JDialog(new Frame(), "Parser Progress", true);
    dialog.SetSize(300, 150);
    dialog.Add(BorderLayout.North, new JLabel("Parsing " + sentences.Count + " sentences"));
    dialog.Add(BorderLayout.Center, progress);
    dialog.Add(BorderLayout.South, cancel);
    ParserPanel.SaveOutputThread thread = new ParserPanel.SaveOutputThread(this, filename, progress, dialog, cancel, sentences);
    cancel.SetText("Cancel");
    cancel.SetToolTipText("Cancel");
    // NOTE(review): registering a null listener means the Cancel button does
    // nothing when clicked; presumably a handler that stops `thread` was
    // intended here — confirm against the original Java source.
    cancel.AddActionListener(null);
    thread.Start();
    // A modal dialog blocks here until the worker thread closes it.
    dialog.SetVisible(true);
}
/// <summary>
/// Demo: POS-tags every sentence of a UTF-8 input file, then prints the
/// adjectives (tags starting with "JJ") of one hard-coded sentence to show
/// how to access words and tags in a tagged sentence.
/// </summary>
/// <param name="args">args[0] = tagger model file, args[1] = text file to tag.</param>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        log.Info("usage: java TaggerDemo2 modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    // "untokenizable=noneKeep": keep characters the tokenizer cannot handle
    // instead of dropping them.
    ITokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
    foreach (IList<IHasWord> sentence in documentPreprocessor)
    {
        IList<TaggedWord> tSentence = tagger.TagSentence(sentence);
        pw.Println(SentenceUtils.ListToString(tSentence, false));
    }
    // Fix: close the input reader once the document has been fully consumed
    // (it was previously leaked; only the writer was closed).
    r.Close();
    // print the adjectives in one more sentence. This shows how to get at
    // words and tags in a tagged sentence.
    IList<IHasWord> sent = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    IList<TaggedWord> taggedSent = tagger.TagSentence(sent);
    foreach (TaggedWord tw in taggedSent)
    {
        if (tw.Tag().StartsWith("JJ"))
        {
            pw.Println(tw.Word());
        }
    }
    pw.Close();
}
// Tags every sentence of the named file with the default POS model and
// writes each tagged sentence to standard output.
public static void Execute(string fileName)
{
    var posTagger = new MaxentTagger(TaggerDemo.Model);
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
    var input = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "utf-8"));
    var preprocessor = new DocumentPreprocessor(input);
    preprocessor.setTokenizerFactory(tokenizerFactory);
    // The preprocessor yields one tokenized sentence per iteration.
    foreach (List tokenizedSentence in preprocessor)
    {
        var tagged = posTagger.tagSentence(tokenizedSentence);
        System.Console.WriteLine(Sentence.listToString(tagged, false));
    }
}
// Splits the input text into sentences, re-joining each sentence's tokens
// with their original whitespace, and returns one string per sentence.
public static List<string> Go(string input)
{
    java.io.Reader source = new java.io.StringReader(input);
    DocumentPreprocessor preprocessor = new DocumentPreprocessor(source);
    preprocessor.setTokenizerFactory(TokenizerFactory);
    var sentences = new List<string>();
    foreach (java.util.List tokenized in preprocessor)
    {
        sentences.Add(StringUtils.joinWithOriginalWhiteSpace(tokenized));
    }
    return sentences;
}
/// <summary>
/// Demo: POS-tags and dependency-parses a fixed example sentence, logging
/// the grammatical structure of each parsed sentence.
/// Usage: [-tagger taggerPath] [-model modelPath]
/// </summary>
// static main method only
public static void Main(string[] args)
{
    string modelPath = DependencyParser.DefaultModel;
    string taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    // Each recognized flag consumes two slots: the flag itself and its value.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        switch (args[argIndex])
        {
            case "-tagger":
            {
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            case "-model":
            {
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            default:
            {
                // Fix: throw the specific ArgumentException (still a subclass of
                // Exception) rather than the generic base type.
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
        }
    }
    string text = "I can almost always tell when movies use fake dinosaurs.";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.LoadFromModelFile(modelPath);
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    foreach (IList<IHasWord> sentence in tokenizer)
    {
        IList<TaggedWord> tagged = tagger.TagSentence(sentence);
        GrammaticalStructure gs = parser.Predict(tagged);
        // Print typed dependencies
        log.Info(gs);
    }
}
/// <summary>
/// Demo: POS-tags and shift-reduce-parses a fixed example sentence, logging
/// the constituency tree of each parsed sentence.
/// Usage: [-tagger taggerPath] [-model modelPath]
/// </summary>
public static void Main(string[] args)
{
    string modelPath = "edu/stanford/nlp/models/srparser/englishSR.ser.gz";
    string taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    // Each recognized flag consumes two slots: the flag itself and its value.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        switch (args[argIndex])
        {
            case "-tagger":
            {
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            case "-model":
            {
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            }

            default:
            {
                // Fix: throw the specific ArgumentException (still a subclass of
                // Exception) rather than the generic base type.
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
        }
    }
    string text = "My dog likes to shake his stuffed chickadee toy.";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    ShiftReduceParser model = ((ShiftReduceParser)ShiftReduceParser.LoadModel(modelPath));
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    foreach (IList<IHasWord> sentence in tokenizer)
    {
        IList<TaggedWord> tagged = tagger.TagSentence(sentence);
        Tree tree = model.Apply(tagged);
        log.Info(tree);
    }
}
// Demonstrates loading, sentence-segmenting, and tokenizing a file with
// DocumentPreprocessor, then parsing each sentence and printing its
// CC-processed typed dependencies.
public void LoadSentencesFromFile()
{
    var languagePack = new PennTreebankLanguagePack();
    var structureFactory = languagePack.grammaticalStructureFactory();
    // A custom tokenizer could also be created here and handed to the
    // preprocessor instead of relying on its default.
    var preprocessor = new DocumentPreprocessor(Files.DataFile("SampleText.txt"));
    foreach (var tokenizedSentence in preprocessor.ToSeq().Cast<List>())
    {
        var tree = _lp.apply(tokenizedSentence);
        Assert.NotNull(tree);
        tree.pennPrint();
        var structure = structureFactory.newGrammaticalStructure(tree);
        var dependencies = structure.typedDependenciesCCprocessed(true);
        TestContext.Out.WriteLine($"\n{dependencies}\n");
    }
}
/// <summary>
/// Builds a coref Document for the given input: collects one mention list
/// per sentence (from gold annotations when enabled, otherwise from the
/// precomputed coref mention annotations), then preprocesses the document.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual Document MakeDocument(InputDoc input)
{
    IList<IList<Mention>> mentionsPerSentence = new List<IList<Mention>>();
    if (CorefProperties.UseGoldMentions(props))
    {
        // Gold path: rebuild each gold mention over the sentence's tokens and
        // let the mention detector assign heads.
        IList<ICoreMap> sentences = input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        for (int sentIdx = 0; sentIdx < sentences.Count; sentIdx++)
        {
            ICoreMap sentence = sentences[sentIdx];
            IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<Mention> current = new List<Mention>();
            mentionsPerSentence.Add(current);
            foreach (Mention gold in input.goldMentions[sentIdx])
            {
                current.Add(new Mention(-1, gold.startIndex, gold.endIndex, words, null, null, new List<CoreLabel>(words.SubList(gold.startIndex, gold.endIndex))));
            }
            md.FindHead(sentence, current);
        }
    }
    else
    {
        // System path: mentions were already attached to each sentence.
        foreach (ICoreMap sentence in input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            mentionsPerSentence.Add(sentence.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)));
        }
    }
    Document doc = new Document(input, mentionsPerSentence);
    if (input.goldMentions != null)
    {
        FindGoldMentionHeads(doc);
    }
    DocumentPreprocessor.Preprocess(doc, dict, null, headFinder);
    return doc;
}
/// <summary>
/// Command-line tool: loads a LexicalizedParser with a DVModel reranker,
/// parses each sentence of an input file, and writes per-sentence files
/// ("sentenceN.txt") containing the sentence, its best deep tree, the word
/// vectors, and the tree's node matrices.
/// Flags: -model, -output (directory, must not already exist), -input,
/// -testTreebank; unrecognized flags are forwarded to the parser loader.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string modelPath = null;
    string outputPath = null;
    string inputPath = null;
    string testTreebankPath = null;
    IFileFilter testTreebankFilter = null;
    IList<string> unusedArgs = Generics.NewArrayList();
    // Scan flags; anything unrecognized is collected and passed through to
    // LexicalizedParser.LoadModel below.
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
        {
            modelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
            {
                outputPath = args[argIndex + 1];
                argIndex += 2;
            }
            else
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                {
                    inputPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        // -testTreebank may carry sub-arguments (path + filter).
                        Pair<string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex++]);
                    }
                }
            }
        }
    }
    string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
    DVModel model = DVParser.GetModelFromLexicalizedParser(parser);
    // Output directory must be fresh: fail if it exists, then create it.
    File outputFile = new File(outputPath);
    FileSystem.CheckNotExistsOrFail(outputFile);
    FileSystem.MkdirOrFail(outputFile);
    int count = 0;
    if (inputPath != null)
    {
        // NOTE(review): `input` is never closed after the loop — presumably a
        // leak inherited from the original; confirm before changing.
        Reader input = new BufferedReader(new FileReader(inputPath));
        DocumentPreprocessor processor = new DocumentPreprocessor(input);
        foreach (IList<IHasWord> sentence in processor)
        {
            count++;
            // index from 1
            IParserQuery pq = parser.ParserQuery();
            if (!(pq is RerankingParserQuery))
            {
                throw new ArgumentException("Expected a RerankingParserQuery");
            }
            RerankingParserQuery rpq = (RerankingParserQuery)pq;
            if (!rpq.Parse(sentence))
            {
                throw new Exception("Unparsable sentence: " + sentence);
            }
            IRerankerQuery reranker = rpq.RerankerQuery();
            if (!(reranker is DVModelReranker.Query))
            {
                throw new ArgumentException("Expected a DVModelReranker");
            }
            // Take the top deep tree and its node-to-vector map.
            DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
            IdentityHashMap<Tree, SimpleMatrix> vectors = deepTree.GetVectors();
            foreach (KeyValuePair<Tree, SimpleMatrix> entry in vectors)
            {
                log.Info(entry.Key + " " + entry.Value);
            }
            // One output file per sentence: text, tree, word vectors, node matrices.
            FileWriter fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
            BufferedWriter bout = new BufferedWriter(fout);
            bout.Write(SentenceUtils.ListToString(sentence));
            bout.NewLine();
            bout.Write(deepTree.GetTree().ToString());
            bout.NewLine();
            foreach (IHasWord word in sentence)
            {
                OutputMatrix(bout, model.GetWordVector(word.Word()));
            }
            Tree rootTree = FindRootTree(vectors);
            OutputTreeMatrices(bout, rootTree, vectors);
            // NOTE(review): only the inner FileWriter is closed; bout is flushed
            // but never closed — confirm whether this matters in this port.
            bout.Flush();
            fout.Close();
        }
    }
}
/// <summary>
/// Reads a file of one text (optionally "id\ttext") per line, in which spans
/// are marked inline with XML-like tags drawn from categoriesAllowed
/// (e.g. &lt;LABEL&gt; ... &lt;/LABEL&gt;), and returns one ICoreMap per
/// sentence whose tokens carry the enclosing label. Tokens outside any tag
/// get the background symbol "O". Each sentence's doc ID is
/// sentIDprefix + lineId + "-" + sentenceIndex.
/// </summary>
/// <param name="reader">source of lines to parse</param>
/// <param name="categoriesAllowed">label names recognized as inline tags</param>
/// <param name="setClassForTheseLabels">optional map from label to the annotation class to set for tokens with that label</param>
/// <param name="setGoldClass">if true, also store the label as the token's gold answer</param>
/// <param name="sentIDprefix">prefix prepended to every sentence's doc ID</param>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Opening and closing tag patterns over the allowed category names.
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // A line is either "id<TAB>text" or bare text (id = line number).
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                text = t[0];
                id = lineNum.ToString();
            }
        }
        id = sentIDprefix + id;
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        // Tokenize without PTB escaping so the inline <LABEL> tags survive
        // as individual tokens and can be matched below.
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // `label` is a running state: set on an opening tag, reset to the
        // background symbol on a closing tag, applied to tokens in between.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    //System.out.println("matched starting");
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        //System.out.println("matched end");
                        label = backgroundSymbol;
                    }
                    else
                    {
                        // Ordinary token: build a CoreLabel carrying the current label.
                        CoreLabel c = new CoreLabel();
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // Package the sentence: text, tokens, and a unique doc ID.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return(sentences);
}
/// <summary>Finds the nearest delimiter starting from index start.</summary>
/// <remarks>
/// Finds the nearest delimiter starting from index start. If <tt>seekDir</tt>
/// is SEEK_FORWARD, finds the nearest delimiter after start. Else, if it is
/// SEEK_BACK, finds the nearest delimiter before start.
/// </remarks>
/// <returns>
/// The character offset of the delimiter (one before the relevant sentence
/// boundary), or -1 if start falls in no sentence interval.
/// </returns>
private int NearestDelimiter(string text, int start, int seekDir)
{
    if (seekDir != SeekBack && seekDir != SeekForward)
    {
        throw new ArgumentException("Unknown seek direction " + seekDir);
    }
    StringReader reader = new StringReader(text);
    // Sentence-segment the text; each sentence's starting character offset
    // becomes a boundary.
    DocumentPreprocessor processor = new DocumentPreprocessor(reader);
    ITokenizerFactory<IHasWord> tf = tlp.GetTokenizerFactory();
    processor.SetTokenizerFactory(tf);
    // Invariant: boundaries[i] = offset where sentence i begins; a final
    // entry of text.Length closes the last half-open interval.
    IList<int> boundaries = new List<int>();
    foreach (IList<IHasWord> sentence in processor)
    {
        if (sentence.Count == 0)
        {
            continue;
        }
        if (!(sentence[0] is IHasOffset))
        {
            throw new InvalidCastException("Expected HasOffsets from the " + "DocumentPreprocessor");
        }
        if (boundaries.Count == 0)
        {
            // The first sentence is pinned to the start of the text.
            boundaries.Add(0);
        }
        else
        {
            IHasOffset first = (IHasOffset)sentence[0];
            boundaries.Add(first.BeginPosition());
        }
    }
    boundaries.Add(text.Length);
    // Find the interval [boundaries[i], boundaries[i+1]) containing start
    // and return the delimiter just before its near or far edge.
    for (int i = 0; i < boundaries.Count - 1; ++i)
    {
        if (boundaries[i] <= start && start < boundaries[i + 1])
        {
            if (seekDir == SeekBack)
            {
                return(boundaries[i] - 1);
            }
            else
            {
                if (seekDir == SeekForward)
                {
                    return(boundaries[i + 1] - 1);
                }
            }
        }
    }
    // The cursor position at the end is actually one past the text length.
    // We might as well highlight the last interval in that case.
    if (boundaries.Count >= 2 && start >= text.Length)
    {
        if (seekDir == SeekBack)
        {
            return(boundaries[boundaries.Count - 2] - 1);
        }
        else
        {
            if (seekDir == SeekForward)
            {
                return(boundaries[boundaries.Count - 1] - 1);
            }
        }
    }
    // start was outside every interval (e.g. negative).
    return(-1);
}
/// <summary>This example shows a few more ways of providing input to a parser.</summary>
/// <remarks>
/// This example shows a few more ways of providing input to a parser.
/// Usage: ParserDemo2 [grammar [textFile]]
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string[] options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
    ITreebankLanguagePack tlp = lp.GetOp().Langpack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
    IEnumerable<IList<IHasWord>> sentences;
    if (args.Length > 1)
    {
        // A text file was supplied: sentence-split it with DocumentPreprocessor.
        DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        foreach (IList<IHasWord> sentence in dp)
        {
            tmp.Add(sentence);
        }
        sentences = tmp;
    }
    else
    {
        // Showing tokenization and parsing in code a couple of different ways.
        // Way 1: build a word list by hand.
        string[] sent = new string[] { "This", "is", "an", "easy", "sentence", "." };
        IList<IHasWord> sentence = new List<IHasWord>();
        foreach (string word in sent)
        {
            sentence.Add(new Word(word));
        }
        string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
        // Way 2: use the default tokenizer for this TreebankLanguagePack.
        ITokenizer<IHasWord> toke = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
        IList<IHasWord> sentence2 = toke.Tokenize();
        // Way 3: supply pre-tagged words.
        string[] sent3 = new string[] { "It", "can", "can", "it", "." };
        string[] tag3 = new string[] { "PRP", "MD", "VB", "PRP", "." };
        // Parser gets second "can" wrong without help
        IList<TaggedWord> sentence3 = new List<TaggedWord>();
        for (int i = 0; i < sent3.Length; i++)
        {
            sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
        }
        Tree parse = lp.Parse(sentence3);
        parse.PennPrint();
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        tmp.Add(sentence);
        tmp.Add(sentence2);
        tmp.Add(sentence3);
        sentences = tmp;
    }
    // Parse every collected sentence and print its tree, CC-processed typed
    // dependencies, per-word labels, and tagged yield.
    foreach (IList<IHasWord> sentence_1 in sentences)
    {
        Tree parse = lp.Parse(sentence_1);
        parse.PennPrint();
        System.Console.Out.WriteLine();
        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine("The words of the sentence:");
        foreach (ILabel lab in parse.Yield())
        {
            if (lab is CoreLabel)
            {
                System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
            }
            else
            {
                System.Console.Out.WriteLine(lab);
            }
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine(parse.TaggedYield());
        System.Console.Out.WriteLine();
    }
    // This method turns the String into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    string sent3_1 = "This is one last test!";
    lp.Parse(sent3_1).PennPrint();
}
/// <summary>
/// Parses each file named in args (from argIndex onward) — "-" means read
/// from stdin — writing parses through treePrint and accumulating
/// word/sentence counters. Parsing runs single-threaded or through a
/// MulticoreWrapper depending on op.testOptions.testingThreads, and
/// per-file output may be redirected to files when writeOutputFiles is set.
/// Finally prints summary statistics (words/sec, sents/sec, failures).
/// </summary>
public virtual void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter)
    where _T0 : IHasWord
{
    // An element delimiter signals XML input; otherwise treat input as plain text.
    DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    if (op.testOptions.verbose)
    {
        if (tokenizerFactory != null)
        {
            pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }
    Timing timer = new Timing();
    // timer.start(); // constructor already starts it.
    //Loop over the files
    for (int i = argIndex; i < args.Length; i++)
    {
        string filename = args[i];
        DocumentPreprocessor documentPreprocessor;
        if (filename.Equals("-"))
        {
            // "-" reads from standard input in the configured encoding.
            try
            {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
        else
        {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
        }
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
        documentPreprocessor.SetEscaper(escaper);
        documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.SetTagDelimiter(tagDelimiter);
        documentPreprocessor.SetElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
        {
            // Already-tokenized input gets no tokenizer at all.
            documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
        }
        else
        {
            documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
        }
        //Setup the output
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles)
        {
            string normalizedName = filename;
            try
            {
                new URL(normalizedName);
                // this will exception if not a URL
                normalizedName = normalizedName.ReplaceAll("/", "_");
            }
            catch (MalformedURLException)
            {
            }
            //It isn't a URL, so silently ignore
            string ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            string fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
            {
                string fseparator = Runtime.GetProperty("file.separator");
                if (fseparator == null || fseparator.IsEmpty())
                {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
            }
            try
            {
                pwo = op.tlpParams.Pw(new FileOutputStream(fname));
            }
            catch (IOException ioe)
            {
                throw new RuntimeIOException(ioe);
            }
        }
        treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
        pwErr.Println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        if (op.testOptions.testingThreads != 1)
        {
            // Multithreaded path: feed sentences into the wrapper, draining
            // completed parser queries as they become available.
            MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                wrapper.Put(sentence);
                while (wrapper.Peek())
                {
                    IParserQuery pq = wrapper.Poll();
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }
            wrapper.Join();
            // Drain results still pending after the last sentence was queued.
            while (wrapper.Peek())
            {
                IParserQuery pq = wrapper.Poll();
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        else
        {
            // Single-threaded path: one query object reused for every sentence.
            IParserQuery pq = pqFactory.ParserQuery();
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                pq.ParseAndReport(sentence, pwErr);
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        treePrint.PrintFooter(pwo);
        if (op.testOptions.writeOutputFiles)
        {
            pwo.Close();
        }
        pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.Stop();
    if (summary)
    {
        if (pcfgLL != null)
        {
            pcfgLL.Display(false, pwErr);
        }
        if (depLL != null)
        {
            depLL.Display(false, pwErr);
        }
        if (factLL != null)
        {
            factLL.Display(false, pwErr);
        }
    }
    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }
    // Throughput statistics over all files parsed in this call.
    double wordspersec = numWords / (((double)millis) / 1000);
    double sentspersec = numSents / (((double)millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00");
    // easier way!
    pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
    if (numFallback > 0)
    {
        pwErr.Println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
    {
        pwErr.Println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0)
        {
            pwErr.Println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0)
        {
            pwErr.Println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0)
        {
            pwErr.Println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}
/// <summary>
/// Turns a text file into trees for use in a RNTN classifier such as
/// the treebank used in the Sentiment project.
/// </summary>
/// <remarks>
/// Turns a text file into trees for use in a RNTN classifier such as
/// the treebank used in the Sentiment project.
/// <br />
/// The expected input file is one sentence per line, with sentences
/// separated by blank lines. The first line has the main label of the sentence together with the full sentence.
/// Lines after the first sentence line but before
/// the blank line will be treated as labeled sub-phrases. The
/// labels should start with the label and then contain a list of
/// tokens the label applies to. All phrases that do not have their own label will take on the main sentence label!
/// For example:
/// <br />
/// <code>
/// 1 Today is not a good day.<br />
/// 3 good<br />
/// 3 good day <br />
/// 3 a good day <br />
/// <br />
/// (next block starts here) <br />
/// </code>
/// By default the englishPCFG parser is used. This can be changed
/// with the
/// <c>-parserModel</c>
/// flag. Specify an input file
/// with
/// <c>-input</c>
/// .
/// <br />
/// If a sentiment model is provided with -sentimentModel, that model
/// will be used to prelabel the sentences. Any spans with given
/// labels will then be used to adjust those labels.
/// </remarks>
public static void Main(string[] args)
{
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    string parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string inputPath = null;
    string sentimentModelPath = null;
    SentimentModel sentimentModel = null;
    for (int argIndex = 0; argIndex < args.Length;)
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
        {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parserModel"))
        {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-sentimentModel"))
        {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        }
        else
        {
            log.Info("Unknown argument " + args[argIndex]);
            System.Environment.Exit(2);
        }
    }
    if (inputPath == null)
    {
        throw new ArgumentException("Must specify input file with -input");
    }
    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
    TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
    if (sentimentModelPath != null)
    {
        sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
    }
    string text = IOUtils.SlurpFileNoExceptions(inputPath);
    string[] chunks = text.Split("\\n\\s*\\n+");
    // need blank line to make a new chunk
    foreach (string chunk in chunks)
    {
        if (chunk.Trim().IsEmpty())
        {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        // Here we take the first line and tokenize it as one sentence.
        string[] lines = chunk.Trim().Split("\\n");
        string sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.SetSentenceFinalPuncWords(new string[] { "\n" });
        // Fix: IEnumerator<T>.Current is undefined before MoveNext() has been
        // called (the original read .Current immediately, a mistranslation of
        // Java's iterator().next()). Advance first, and skip chunks that
        // yield no sentence at all.
        IEnumerator<IList<IHasWord>> sentenceIterator = document.GetEnumerator();
        if (!sentenceIterator.MoveNext())
        {
            continue;
        }
        IList<IHasWord> tokens = sentenceIterator.Current;
        // The first token of the line is the sentence's main label.
        int mainLabel = System.Convert.ToInt32(tokens[0].Word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.SubList(1, tokens.Count);
        //log.info(tokens);
        IDictionary<Pair<int, int>, string> spanToLabels = Generics.NewHashMap();
        for (int i = 1; i < lines.Length; ++i)
        {
            ExtractLabels(spanToLabels, tokens, lines[i]);
        }
        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.Apply(tokens);
        Tree binarized = binarizer.TransformTree(tree);
        Tree collapsedUnary = transformer.TransformTree(binarized);
        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null)
        {
            Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.ForwardPropagateTree(collapsedUnary);
            SetPredictedLabels(collapsedUnary);
        }
        else
        {
            SetUnknownLabels(collapsedUnary, mainLabel);
        }
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
        collapsedUnary.IndexSpans();
        // Overwrite labels for any user-specified spans.
        foreach (KeyValuePair<Pair<int, int>, string> pairStringEntry in spanToLabels)
        {
            SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
        }
        System.Console.Out.WriteLine(collapsedUnary);
    }
}
// Constructs an Analyzer over the document at `path`, storing the supplied
// options and falling back to the default punctuation patterns when none
// are given, then calls Open() (see that method for what loading entails).
// NOTE(review): docType defaults to null, which assumes
// DocumentPreprocessor.DocType is a reference or nullable type in this
// port — confirm; a plain enum could not default to null.
public Analyzer(string path, DocumentPreprocessor.DocType docType = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
    : this()
{
    _path = path;
    _docType = docType;
    _ignore = ignore;
    _options = options;
    // null punctuation means "use the class default patterns".
    _punctuation = punctuation ?? PunctuationPatterns;
    Open();
}