Code Example #1
        private static Edu.Stanford.Nlp.Classify.RVFDataset <string, string> ReadSVMLightFormat(string filename, IIndex <string> featureIndex, IIndex <string> labelIndex, IList <string> lines)
        {
            BufferedReader @in = null;

            Edu.Stanford.Nlp.Classify.RVFDataset <string, string> dataset;
            try
            {
                dataset = new Edu.Stanford.Nlp.Classify.RVFDataset <string, string>(10, featureIndex, labelIndex);
                @in     = IOUtils.ReaderFromString(filename);
                while (@in.Ready())
                {
                    string line = @in.ReadLine();
                    if (lines != null)
                    {
                        lines.Add(line);
                    }
                    dataset.Add(SvmLightLineToRVFDatum(line));
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(@in);
            }
            return(dataset);
        }
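The SvmLightLineToRVFDatum helper is not shown on this page, but the input it parses is the well-known SVMlight line format: a label, then feature:value pairs, then an optional # comment. As a point of reference, here is a minimal standalone sketch of a parser for that format (plain .NET, independent of the Sharpen types above; the method name, the int feature ids, and the tuple return are my own, and it assumes using System and System.Collections.Generic):

// Hypothetical sketch: parse one SVMlight-style line such as "+1 12:0.5 347:1.0 # doc42".
private static (string Label, Dictionary<int, double> Features) ParseSvmLightLine(string line)
{
    int hash = line.IndexOf('#');                  // drop a trailing comment, if present
    if (hash >= 0) line = line.Substring(0, hash);
    string[] parts = line.Trim().Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
    var features = new Dictionary<int, double>();
    for (int i = 1; i < parts.Length; i++)         // parts[0] is the class label
    {
        int colon = parts[i].IndexOf(':');
        int featureId = int.Parse(parts[i].Substring(0, colon));
        features[featureId] = double.Parse(parts[i].Substring(colon + 1));
    }
    return (parts[0], features);
}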
Code Example #2
        /// <summary>
        /// The format of each line of this file is
        /// fullStateName ( TAB abbrev )
        /// The file is case-sensitive, and lookups are matched case-sensitively.
        /// </summary>
        /// <remarks>
        /// The format of each line of this file is
        /// fullStateName ( TAB abbrev )
        /// The file is case-sensitive, and lookups are matched case-sensitively.
        /// The result is: statesAbbreviation is a hash from each abbrev to the fullStateName.
        /// </remarks>
        public virtual void LoadStateAbbreviation(string statesFile)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(statesFile);
                for (string line; (line = reader.ReadLine()) != null;)
                {
                    string[] tokens = line.Split("\t");
                    foreach (string token in tokens)
                    {
                        statesAbbreviation[token] = tokens[0];
                    }
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
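A hedged illustration of what this builds; the file contents below are invented, but the mapping follows directly from the loop: every token on a line, including the full state name itself, maps to the first token.

// Hypothetical states file lines (TAB-separated):
//   California<TAB>CA<TAB>Calif.
//   New York<TAB>NY
// Resulting statesAbbreviation entries:
//   "California" -> "California", "CA" -> "California", "Calif." -> "California"
//   "New York"   -> "New York",   "NY" -> "New York"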
Code Example #3
        // give up
        /// <summary>This assumes each line is of the form (number=value) and it adds each value in order of the lines in the file.</summary>
        /// <remarks>
        /// This assumes each line is of the form (number=value) and it adds each value in order of the lines in the file.
        /// Warning: This ignores the value of number, and just indexes each value it encounters in turn!
        /// </remarks>
        /// <param name="file">Which file to load</param>
        /// <returns>An index built out of the lines in the file</returns>
        public static IIndex <string> LoadFromFilename(string file)
        {
            IIndex <string> index = new Edu.Stanford.Nlp.Util.HashIndex <string>();
            BufferedReader  br    = null;

            try
            {
                br = IOUtils.ReaderFromString(file);
                for (string line; (line = br.ReadLine()) != null;)
                {
                    int start = line.IndexOf('=');
                    if (start == -1 || start == line.Length - 1)
                    {
                        continue;
                    }
                    index.Add(Sharpen.Runtime.Substring(line, start + 1));
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(br);
            }
            return(index);
        }
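To make the warning in the remarks concrete, here is an invented input file and the index it yields; the number before '=' plays no role in the result.

// Hypothetical input file:
//   0=the
//   1=of
//   7=and      <- the "7" is ignored
// Values are indexed in line order, so "and" ends up at index 2, not 7.
// Lines with no '=' or with nothing after '=' are skipped entirely.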
Code Example #4
        /// <summary>
        /// The format of the demonyms file is
        /// countryCityOrState ( TAB demonym )
        /// Lines starting with # are ignored
        /// The file is cased, but the in-memory data structures store it uncased.
        /// </summary>
        /// <remarks>
        /// The format of the demonyms file is
        /// countryCityOrState ( TAB demonym )
        /// Lines starting with # are ignored
        /// The file is cased, but the in-memory data structures store it uncased.
        /// The results are:
        /// demonyms is a hash from each country (etc.) to a set of demonymic Strings;
        /// adjectiveNation is a set of demonymic Strings;
        /// demonymSet has all country (etc.) names and all demonymic Strings.
        /// </remarks>
        private void LoadDemonymLists(string demonymFile)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(demonymFile);
                for (string line; (line = reader.ReadLine()) != null;)
                {
                    line = line.ToLower(Locale.English);
                    string[] tokens = line.Split("\t");
                    if (tokens[0].StartsWith("#"))
                    {
                        continue;
                    }
                    ICollection <string> set = Generics.NewHashSet();
                    foreach (string s in tokens)
                    {
                        set.Add(s);
                        demonymSet.Add(s);
                    }
                    demonyms[tokens[0]] = set;
                }
                Sharpen.Collections.AddAll(adjectiveNation, demonymSet);
                adjectiveNation.RemoveAll(demonyms.Keys);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
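The entries below are invented, but the resulting structures follow directly from the loop above; note that tokens[0], the place name itself, also lands in its own demonym set and in demonymSet.

// Hypothetical demonyms file line (TAB-separated, lowercased on load):
//   spain<TAB>spaniard<TAB>spanish
// After LoadDemonymLists runs:
//   demonyms["spain"] == { "spain", "spaniard", "spanish" }
//   demonymSet        contains "spain", "spaniard", and "spanish"
//   adjectiveNation   gains only "spaniard" and "spanish", because the
//                     demonyms keys are removed from it at the end.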
Code Example #5
 private static void LoadCorefDict(string[] file, List <ICounter <Pair <string, string> > > dict)
 {
     for (int i = 0; i < 4; i++)
     {
         dict.Add(new ClassicCounter <Pair <string, string> >());
         BufferedReader reader = null;
         try
         {
             reader = IOUtils.ReaderFromString(file[i]);
             // Skip the first line (header)
             reader.ReadLine();
             while (reader.Ready())
             {
                 string[] split = reader.ReadLine().Split("\t");
                 dict[i].SetCount(new Pair <string, string>(split[0], split[1]), double.Parse(split[2]));
             }
         }
         catch (IOException e)
         {
             throw new Exception(e.Message, e);
         }
         finally
         {
             IOUtils.CloseIgnoringExceptions(reader);
         }
     }
 }
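The loader skips one header line and then reads columns 0, 1, and 2 of each TAB-separated row, so each of the four files presumably looks something like this (column names and values invented):

// word1<TAB>word2<TAB>count<TAB>...    <- header row, skipped
// leader<TAB>president<TAB>173         <- dict[i].SetCount(("leader", "president"), 173.0)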
Code Example #6
        private static void LoadSignatures(string file, IDictionary <string, ICounter <string> > sigs)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(file);
                while (reader.Ready())
                {
                    string[]          split = reader.ReadLine().Split("\t");
                    ICounter <string> cntr  = new ClassicCounter <string>();
                    sigs[split[0]] = cntr;
                    for (int i = 1; i < split.Length; i = i + 2)
                    {
                        cntr.SetCount(split[i], double.Parse(split[i + 1]));
                    }
                }
            }
            catch (IOException e)
            {
                throw new Exception(e.Message, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
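From the inner loop, each line of the signatures file is a key followed by alternating signature/count pairs; the values here are invented:

// word<TAB>sig1<TAB>3.0<TAB>sig2<TAB>1.0
// => sigs["word"] becomes a counter holding { sig1: 3.0, sig2: 1.0 }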
Code Example #7
        /* HashMap of singleton instances */
        /// <summary>Sets up dictionary of valid verbs and their POS info from an input file.</summary>
        /// <remarks>
        /// Sets up dictionary of valid verbs and their POS info from an input file.
        /// The input file must be a list of whitespace-separated verb-lemma-POS triples, one verb
        /// form per line.
        /// </remarks>
        /// <param name="dictPath">the path to the dictionary file</param>
        private static Dictionary <string, string> SetupDictionary(string dictPath)
        {
            Dictionary <string, string> dictionary = new Dictionary <string, string>();
            BufferedReader br = null;

            try
            {
                br = IOUtils.ReaderFromString(dictPath);
                for (string line; (line = br.ReadLine()) != null;)
                {
                    string[] words = line.Trim().Split("\\s");
                    if (words.Length < 3)
                    {
                        System.Console.Error.Printf("SpanishVerbStripper: adding words to dict, missing fields, ignoring line: %s%n", line);
                    }
                    else
                    {
                        dictionary[words[0]] = words[2];
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException)
            {
                log.Info("Could not load Spanish data file " + dictPath);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(br);
            }
            return(dictionary);
        }
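A made-up dictionary line and the entry it produces; note that words[1], the lemma, is required to be present but is never stored:

// Dictionary file line (whitespace-separated: verb form, lemma, POS; values invented):
//   hablando hablar VERB
// => dictionary["hablando"] == "VERB"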
Code Example #8
        /// <summary>Static method for getting an NERClassifierCombiner from a string path.</summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.InvalidCastException"/>
        public static ClassifierCombiner GetClassifier(string loadPath, Properties props)
        {
            ObjectInputStream     ois       = IOUtils.ReadStreamFromString(loadPath);
            NERClassifierCombiner returnNCC = ((NERClassifierCombiner)GetClassifier(ois, props));

            IOUtils.CloseIgnoringExceptions(ois);
            return(returnNCC);
        }
Code Example #9
        /// <exception cref="System.IO.IOException"/>
        public virtual Annotation CreateFromFile(string filename)
        {
            InputStream stream = new BufferedInputStream(new FileInputStream(filename));
            Annotation  anno   = Create(stream);

            IOUtils.CloseIgnoringExceptions(stream);
            return(anno);
        }
Code Example #10
        // static method for getting a ClassifierCombiner from a string path
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.InvalidCastException"/>
        public static Edu.Stanford.Nlp.IE.ClassifierCombiner GetClassifier(string loadPath, Properties props)
        {
            ObjectInputStream ois = IOUtils.ReadStreamFromString(loadPath);

            Edu.Stanford.Nlp.IE.ClassifierCombiner returnCC = GetClassifier(ois, props);
            IOUtils.CloseIgnoringExceptions(ois);
            return(returnCC);
        }
Code Example #11
        /// <exception cref="System.IO.IOException"/>
        public virtual Annotation CreateFromFile(string filename)
        {
            Reader     r    = IOUtils.GetBufferedFileReader(filename);
            Annotation anno = Create(r);

            IOUtils.CloseIgnoringExceptions(r);
            return(anno);
        }
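Every snippet on this page funnels cleanup through IOUtils.CloseIgnoringExceptions, whose source is not shown here. The following is only a sketch of what such a helper plausibly looks like, an assumption about its shape rather than the actual Stanford/Sharpen code:

// Assumed shape of the helper; the real converted source may differ in details.
public static void CloseIgnoringExceptions(IDisposable closeable)
{
    if (closeable == null)
    {
        return;               // callers pass null when the reader was never opened
    }
    try
    {
        closeable.Dispose();  // assumes the reader/stream shims are IDisposable
    }
    catch (IOException)
    {
        // swallowed on purpose: a failure while closing should not mask the result
    }
}

The null check matters because of the idiom used throughout these examples: the reader is declared null, assigned inside try, and closed in finally, so the helper must tolerate a reader that was never successfully opened.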
Code Example #12
        /// <exception cref="System.IO.IOException"/>
        private static void Tok(IList <string> inputFileList, IList <string> outputFileList, string charset, Pattern parseInsidePattern, Pattern filterPattern, string options, bool preserveLines, bool oneLinePerElement, bool dump, bool lowerCase)
        {
            long start     = Runtime.NanoTime();
            long numTokens = 0;
            int  numFiles  = inputFileList.Count;

            if (numFiles == 0)
            {
                Reader         stdin  = IOUtils.ReaderFromStdin(charset);
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.Console.Out, charset));
                numTokens += TokReader(stdin, writer, parseInsidePattern, filterPattern, options, preserveLines, oneLinePerElement, dump, lowerCase);
                IOUtils.CloseIgnoringExceptions(writer);
            }
            else
            {
                BufferedWriter @out = null;
                if (outputFileList == null)
                {
                    @out = new BufferedWriter(new OutputStreamWriter(System.Console.Out, charset));
                }
                for (int j = 0; j < numFiles; j++)
                {
                    using (Reader r = IOUtils.ReaderFromString(inputFileList[j], charset))
                    {
                        if (outputFileList != null)
                        {
                            @out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList[j]), charset));
                        }
                        numTokens += TokReader(r, @out, parseInsidePattern, filterPattern, options, preserveLines, oneLinePerElement, dump, lowerCase);
                    }
                    if (outputFileList != null)
                    {
                        IOUtils.CloseIgnoringExceptions(@out);
                    }
                }
                // end for j going through inputFileList
                if (outputFileList == null)
                {
                    IOUtils.CloseIgnoringExceptions(@out);
                }
            }
            long   duration    = Runtime.NanoTime() - start;
            double wordsPerSec = (double)numTokens / ((double)duration / 1000000000.0);

            System.Console.Error.Printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
        }
Code Example #13
        public static Annotation ReadSerializedProtobufFile(File fileIn)
        {
            Annotation annotation;

            try
            {
                ProtobufAnnotationSerializer pas = new ProtobufAnnotationSerializer();
                InputStream @is = new BufferedInputStream(new FileInputStream(fileIn));
                Pair <Annotation, InputStream> pair = pas.Read(@is);
                pair.second.Close();
                annotation = pair.first;
                IOUtils.CloseIgnoringExceptions(@is);
                return(annotation);
            }
            catch (Exception e)
            {
                throw new Exception(e.Message, e);
            }
        }
Code Example #14
        /// <summary>
        /// Read XML from the specified file and write XML to stdout,
        /// while transforming text appearing inside the specified XML
        /// tags by applying the specified
        /// <see cref="Java.Util.Function.IFunction{T, R}"><code>Function</code></see>
        /// .  Note that the <code>Function</code>
        /// you supply must be prepared to accept <code>String</code>s as
        /// input; if your <code>Function</code> doesn't handle
        /// <code>String</code>s, you need to write a wrapper for it that
        /// does.
        /// </summary>
        /// <param name="tags">
        /// an array of <code>String</code>s, each an XML tag
        /// within which the transformation should be applied
        /// </param>
        /// <param name="fn">
        /// the
        /// <see cref="Java.Util.Function.IFunction{T, R}"><code>Function</code></see>
        /// to apply
        /// </param>
        /// <param name="in">the <code>File</code> to read from</param>
        public virtual void TransformXML(string[] tags, IFunction <string, T> fn, File @in)
        {
            InputStream ins = null;

            try
            {
                ins = new BufferedInputStream(new FileInputStream(@in));
                TransformXML(tags, fn, ins, System.Console.Out);
            }
            catch (Exception e)
            {
                log.Info("Error reading file " + @in + ": " + e);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(ins);
            }
        }
Code Example #15
        // write a ClassifierCombiner to disk, this is based on CRFClassifier code
        public override void SerializeClassifier(string serializePath)
        {
            log.Info("Serializing classifier to " + serializePath + "...");
            ObjectOutputStream oos = null;

            try
            {
                oos = IOUtils.WriteStreamFromString(serializePath);
                SerializeClassifier(oos);
                log.Info("done.");
            }
            catch (Exception e)
            {
                throw new RuntimeIOException("Failed to save classifier", e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(oos);
            }
        }
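Example #10 above and this method together give a load/save round trip for a ClassifierCombiner. A hedged usage sketch; the paths are invented:

// Hypothetical round trip: load a serialized combiner, then re-serialize it elsewhere.
Edu.Stanford.Nlp.IE.ClassifierCombiner cc =
    Edu.Stanford.Nlp.IE.ClassifierCombiner.GetClassifier("models/combiner.ser.gz", new Properties());
cc.SerializeClassifier("models/combiner-copy.ser.gz");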
Code Example #16
        /// <exception cref="System.IO.IOException"/>
        private static void CheckLineIterable(bool includeEol)
        {
            string[]             expected   = new string[] { "abcdefhij\r\n", "klnm\r\n", "opqrst\n", "uvwxyz\r", "I am a longer line than the rest\n", "12345" };
            string               testString = StringUtils.Join(expected, string.Empty);
            Reader               reader     = new StringReader(testString);
            int                  i          = 0;
            IEnumerable <string> iterable   = IOUtils.GetLineIterable(reader, 10, includeEol);

            foreach (string line in iterable)
            {
                string expLine = expected[i];
                if (!includeEol)
                {
                    expLine = expLine.ReplaceAll("\\r|\\n", string.Empty);
                }
                NUnit.Framework.Assert.AreEqual("Checking line " + i, expLine, line);
                i++;
            }
            NUnit.Framework.Assert.AreEqual("Check got all lines", expected.Length, i);
            IOUtils.CloseIgnoringExceptions(reader);
        }
Code Example #17
        /// <summary>
        /// Read XML from the specified file and write XML to specified file,
        /// while transforming text appearing inside the specified XML tags
        /// by applying the specified
        /// <see cref="Java.Util.Function.Func{T, R}"><code>Function</code></see>
        /// .
        /// Note that the <code>Function</code> you supply must be
        /// prepared to accept <code>String</code>s as input; if your
        /// <code>Function</code> doesn't handle <code>String</code>s, you
        /// need to write a wrapper for it that does.
        /// </summary>
        /// <param name="tags">
        /// an array of <code>String</code>s, each an XML tag
        /// within which the transformation should be applied
        /// </param>
        /// <param name="fn">
        /// the
        /// <see cref="Java.Util.Function.Func{T, R}"><code>Function</code></see>
        /// to apply
        /// </param>
        /// <param name="in">the <code>File</code> to read from</param>
        /// <param name="out">the <code>File</code> to write to</param>
        public virtual void TransformXML(string[] tags, Func <string, T> fn, File @in, File @out)
        {
            InputStream  ins  = null;
            OutputStream outs = null;

            try
            {
                ins  = new BufferedInputStream(new FileInputStream(@in));
                outs = new BufferedOutputStream(new FileOutputStream(@out));
                TransformXML(tags, fn, ins, outs);
            }
            catch (Exception e)
            {
                log.Info("Error reading file " + @in + " or writing file " + @out + ": " + e);
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(ins);
                IOUtils.CloseIgnoringExceptions(outs);
            }
        }
Code Example #18
 private void PrimeNext()
 {
     do
     {
         // It is necessary to loop because if a document has a pattern
         // that goes: <tag></tag> the xmlItr will return an empty
         // string, which the plainItr will process to null.  If we
         // didn't loop to find the next tag, the iterator would stop.
         if (this.plainItr != null && this.plainItr.MoveNext())
         {
             this.nextSent = this.plainItr.Current;
         }
         else
         {
             if (this.xmlItr.MoveNext())
             {
                 string block = this.xmlItr.Current;
                 this._enclosing.inputReader = new BufferedReader(new StringReader(block));
                 this.plainItr = new DocumentPreprocessor.PlainTextIterator(this);
                 if (this.plainItr.MoveNext())
                 {
                     this.nextSent = this.plainItr.Current;
                 }
                 else
                 {
                     this.nextSent = null;
                 }
             }
             else
             {
                 IOUtils.CloseIgnoringExceptions(this.originalDocReader);
                 this.nextSent = null;
                 break;
             }
         }
     }while (this.nextSent == null);
 }
Code Example #19
        private static void LoadCorefDictPMI(string file, ICounter <Pair <string, string> > dict)
        {
            BufferedReader reader = null;

            try
            {
                reader = IOUtils.ReaderFromString(file);
                // Skip the first line (header)
                reader.ReadLine();
                while (reader.Ready())
                {
                    string[] split = reader.ReadLine().Split("\t");
                    dict.SetCount(new Pair <string, string>(split[0], split[1]), double.Parse(split[3]));
                }
            }
            catch (IOException e)
            {
                throw new Exception(e.Message, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(reader);
            }
        }
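This file has the same shape as the one in Example #5, but the score is taken from column 3 (presumably the PMI value, given the method name) rather than column 2; the row below is invented:

// word1<TAB>word2<TAB>count<TAB>pmi<TAB>...    <- header row, skipped
// he<TAB>leader<TAB>42<TAB>2.71                <- dict.SetCount(("he", "leader"), 2.71)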
Code Example #20
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            // Strips off hyphens
            Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());

            if (options.Contains("help") || args.Length == 0)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            int nThreads = PropertiesUtils.GetInt(options, "nthreads", 1);

            Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter = GetSegmenter(options);
            // Decode either an evaluation file or raw text
            try
            {
                PrintWriter pwOut;
                if (segmenter.flags.outputEncoding != null)
                {
                    OutputStreamWriter @out = new OutputStreamWriter(System.Console.Out, segmenter.flags.outputEncoding);
                    pwOut = new PrintWriter(@out, true);
                }
                else
                {
                    if (segmenter.flags.inputEncoding != null)
                    {
                        OutputStreamWriter @out = new OutputStreamWriter(System.Console.Out, segmenter.flags.inputEncoding);
                        pwOut = new PrintWriter(@out, true);
                    }
                    else
                    {
                        pwOut = new PrintWriter(System.Console.Out, true);
                    }
                }
                if (segmenter.flags.testFile != null)
                {
                    if (segmenter.flags.answerFile == null)
                    {
                        segmenter.Evaluate(pwOut);
                    }
                    else
                    {
                        Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter.EvaluateRawText(pwOut);
                    }
                }
                else
                {
                    BufferedReader br          = (segmenter.flags.textFile == null) ? IOUtils.ReaderFromStdin() : IOUtils.ReaderFromString(segmenter.flags.textFile, segmenter.flags.inputEncoding);
                    double         charsPerSec = Decode(segmenter, br, pwOut, nThreads);
                    IOUtils.CloseIgnoringExceptions(br);
                    System.Console.Error.Printf("Done! Processed input text at %.2f input characters/second%n", charsPerSec);
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException)
            {
                System.Console.Error.Printf("%s: Could not open %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, segmenter.flags.textFile);
            }
        }
Code Example #21
        // static main
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            // set up optional output files
            PrintWriter @out;

            if (args.Length > 1)
            {
                @out = new PrintWriter(args[1]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            Properties props = new Properties();

            props.Load(IOUtils.ReaderFromString("StanfordCoreNLP-chinese.properties"));
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            Annotation      document;

            if (args.Length > 0)
            {
                document = new Annotation(IOUtils.SlurpFileNoExceptions(args[0]));
            }
            else
            {
                document = new Annotation("克林顿说,华盛顿将逐步落实对韩国的经济援助。金大中对克林顿的讲话报以掌声:克林顿总统在会谈中重申,他坚定地支持韩国摆脱经济危机。");
            }
            pipeline.Annotate(document);
            IList <ICoreMap> sentences = document.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int sentNo = 1;

            foreach (ICoreMap sentence in sentences)
            {
                @out.Println("Sentence #" + sentNo + " tokens are:");
                foreach (ICoreMap token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    @out.Println(token.ToShorterString("Text", "CharacterOffsetBegin", "CharacterOffsetEnd", "Index", "PartOfSpeech", "NamedEntityTag"));
                }
                @out.Println("Sentence #" + sentNo + " basic dependencies are:");
                @out.Println(sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)).ToString(SemanticGraph.OutputFormat.List));
                sentNo++;
            }
            // Access coreference.
            @out.Println("Coreference information");
            IDictionary <int, CorefChain> corefChains = document.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation));

            if (corefChains == null)
            {
                return;
            }
            foreach (KeyValuePair <int, CorefChain> entry in corefChains)
            {
                @out.Println("Chain " + entry.Key);
                foreach (CorefChain.CorefMention m in entry.Value.GetMentionsInTextualOrder())
                {
                    // We need to subtract one since the indices count from 1 but the Lists start from 0
                    IList <CoreLabel> tokens = sentences[m.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation));
                    // We subtract two for end: one for 0-based indexing, and one because we want last token of mention not one following.
                    @out.Println("  " + m + ":[" + tokens[m.startIndex - 1].BeginPosition() + ", " + tokens[m.endIndex - 2].EndPosition() + ')');
                }
            }
            IOUtils.CloseIgnoringExceptions(@out);
        }
Code Example #22
        /// <summary>Usage: java -cp "*" StanfordCoreNlpDemo [inputFile [outputTextFile [outputXmlFile]]]</summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            // set up optional output files
            PrintWriter @out;

            if (args.Length > 1)
            {
                @out = new PrintWriter(args[1]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            PrintWriter xmlOut = null;

            if (args.Length > 2)
            {
                xmlOut = new PrintWriter(args[2]);
            }
            // Create a CoreNLP pipeline. To build the default pipeline, you can just use:
            //   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            // Here's a more complex setup example:
            //   Properties props = new Properties();
            //   props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse");
            //   props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz");
            //   props.put("ner.applyNumericClassifiers", "false");
            //   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            // Add in sentiment
            Properties props = new Properties();

            props.SetProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            // Initialize an Annotation with some text to be annotated. The text is the argument to the constructor.
            Annotation annotation;

            if (args.Length > 0)
            {
                annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[0]));
            }
            else
            {
                annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply.");
            }
            // run all the selected Annotators on this text
            pipeline.Annotate(annotation);
            // this prints out the results of sentence analysis to file(s) in good formats
            pipeline.PrettyPrint(annotation, @out);
            if (xmlOut != null)
            {
                pipeline.XmlPrint(annotation, xmlOut);
            }
            // Access the Annotation in code
            // The toString() method on an Annotation just prints the text of the Annotation
            // But you can see what is in it with other methods like toShorterString()
            @out.Println();
            @out.Println("The top level annotation");
            @out.Println(annotation.ToShorterString());
            @out.Println();
            // An Annotation is a Map with Class keys for the linguistic analysis types.
            // You can get and use the various analyses individually.
            // For instance, this gets the parse tree of the first sentence in the text.
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

            if (sentences != null && !sentences.IsEmpty())
            {
                ICoreMap sentence = sentences[0];
                @out.Println("The keys of the first sentence's CoreMap are:");
                @out.Println(sentence.KeySet());
                @out.Println();
                @out.Println("The first sentence is:");
                @out.Println(sentence.ToShorterString());
                @out.Println();
                @out.Println("The first sentence tokens are:");
                foreach (ICoreMap token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    @out.Println(token.ToShorterString());
                }
                Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
                @out.Println();
                @out.Println("The first sentence parse tree is:");
                tree.PennPrint(@out);
                @out.Println();
                @out.Println("The first sentence basic dependencies are:");
                @out.Println(sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)).ToString(SemanticGraph.OutputFormat.List));
                @out.Println("The first sentence collapsed, CC-processed dependencies are:");
                SemanticGraph graph = sentence.Get(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation));
                @out.Println(graph.ToString(SemanticGraph.OutputFormat.List));
                // Access coreference. In the coreference link graph,
                // each chain stores a set of mentions that co-refer with each other,
                // along with a method for getting the most representative mention.
                // Both sentence and token offsets start at 1!
                @out.Println("Coreference information");
                IDictionary <int, CorefChain> corefChains = annotation.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation));
                if (corefChains == null)
                {
                    return;
                }
                foreach (KeyValuePair <int, CorefChain> entry in corefChains)
                {
                    @out.Println("Chain " + entry.Key);
                    foreach (CorefChain.CorefMention m in entry.Value.GetMentionsInTextualOrder())
                    {
                        // We need to subtract one since the indices count from 1 but the Lists start from 0
                        IList <CoreLabel> tokens = sentences[m.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation));
                        // We subtract two for end: one for 0-based indexing, and one because we want last token of mention not one following.
                        @out.Println("  " + m + ", i.e., 0-based character offsets [" + tokens[m.startIndex - 1].BeginPosition() + ", " + tokens[m.endIndex - 2].EndPosition() + ")");
                    }
                }
                @out.Println();
                @out.Println("The first sentence overall sentiment rating is " + sentence.Get(typeof(SentimentCoreAnnotations.SentimentClass)));
            }
            IOUtils.CloseIgnoringExceptions(@out);
            IOUtils.CloseIgnoringExceptions(xmlOut);
        }
Code Example #23
        /// <exception cref="System.Exception"/>
        public static void RunCoref(Properties props)
        {
            /*
             * property, environment setting
             */
            Redwood.HideChannelsEverywhere("debug-cluster", "debug-mention", "debug-preprocessor", "debug-docreader", "debug-mergethres", "debug-featureselection", "debug-md");
            int    nThreads  = HybridCorefProperties.GetThreadCounts(props);
            string timeStamp = Calendar.GetInstance().GetTime().ToString().ReplaceAll("\\s", "-").ReplaceAll(":", "-");
            Logger logger    = Logger.GetLogger(typeof(Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem).FullName);

            // set log file path
            if (props.Contains(HybridCorefProperties.LogProp))
            {
                File logFile = new File(props.GetProperty(HybridCorefProperties.LogProp));
                RedwoodConfiguration.Current().Handlers(RedwoodConfiguration.Handlers.File(logFile)).Apply();
                Redwood.Log("Starting coref log");
            }
            log.Info(props.ToString());
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem cs = new Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem(props);

            /*
             * output setting
             */
            // prepare conll output
            string      goldOutput        = null;
            string      beforeCorefOutput = null;
            string      afterCorefOutput  = null;
            PrintWriter writerGold        = null;
            PrintWriter writerBeforeCoref = null;
            PrintWriter writerAfterCoref  = null;

            if (HybridCorefProperties.DoScore(props))
            {
                string pathOutput = CorefProperties.ConllOutputPath(props);
                (new File(pathOutput)).Mkdir();
                goldOutput        = pathOutput + "output-" + timeStamp + ".gold.txt";
                beforeCorefOutput = pathOutput + "output-" + timeStamp + ".predicted.txt";
                afterCorefOutput  = pathOutput + "output-" + timeStamp + ".coref.predicted.txt";
                writerGold        = new PrintWriter(new FileOutputStream(goldOutput));
                writerBeforeCoref = new PrintWriter(new FileOutputStream(beforeCorefOutput));
                writerAfterCoref  = new PrintWriter(new FileOutputStream(afterCorefOutput));
            }
            // run coref
            MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]> wrapper = new MulticoreWrapper <Pair <Document, Edu.Stanford.Nlp.Coref.Hybrid.HybridCorefSystem>, StringBuilder[]>(nThreads, new _IThreadsafeProcessor_134
                                                                                                                                                                                                                                        ());
            // conll output and logs
            DateTime startTime = null;

            if (HybridCorefProperties.CheckTime(props))
            {
                startTime = new DateTime();
                System.Console.Error.Printf("END-TO-END COREF Start time: %s\n", startTime);
            }
            // run processes
            int docCnt = 0;

            while (true)
            {
                Document document = cs.docMaker.NextDoc();
                if (document == null)
                {
                    break;
                }
                wrapper.Put(Pair.MakePair(document, cs));
                docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            }
            // Finished reading the input. Wait for jobs to finish
            wrapper.Join();
            docCnt = LogOutput(wrapper, writerGold, writerBeforeCoref, writerAfterCoref, docCnt);
            IOUtils.CloseIgnoringExceptions(writerGold);
            IOUtils.CloseIgnoringExceptions(writerBeforeCoref);
            IOUtils.CloseIgnoringExceptions(writerAfterCoref);
            if (HybridCorefProperties.CheckTime(props))
            {
                System.Console.Error.Printf("END-TO-END COREF Elapsed time: %.3f seconds\n", (((new DateTime()).GetTime() - startTime.GetTime()) / 1000F));
            }
            //      System.err.printf("CORENLP PROCESS TIME TOTAL: %.3f seconds\n", cs.mentionExtractor.corenlpProcessTime);
            if (HybridCorefProperties.CheckMemory(props))
            {
                CheckMemoryUsage();
            }
            // scoring
            if (HybridCorefProperties.DoScore(props))
            {
                string summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, beforeCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, false);
                summary = CorefScorer.GetEvalSummary(CorefProperties.GetScorerPath(props), goldOutput, afterCorefOutput);
                CorefScorer.PrintScoreSummary(summary, logger, true);
                CorefScorer.PrintFinalConllScore(summary);
            }
        }
Code Example #24
        /// <exception cref="Javax.Servlet.ServletException"/>
        public override void Init()
        {
            format = GetServletConfig().GetInitParameter("outputFormat");
            if (format == null || format.Trim().IsEmpty())
            {
                throw new ServletException("Invalid outputFormat setting.");
            }
            string spacingStr = GetServletConfig().GetInitParameter("preserveSpacing");

            if (spacingStr == null || spacingStr.Trim().IsEmpty())
            {
                throw new ServletException("Invalid preserveSpacing setting.");
            }
            //spacing = Boolean.valueOf(spacingStr).booleanValue();
            spacingStr = spacingStr.Trim().ToLower();
            spacing    = "true".Equals(spacingStr);
            string path = GetServletContext().GetRealPath("/WEB-INF/data/models");

            foreach (string classifier in new File(path).List())
            {
                classifiers.Add(classifier);
            }
            // TODO: get this from somewhere more interesting?
            defaultClassifier = classifiers[0];
            foreach (string classifier_1 in classifiers)
            {
                Log(classifier_1);
            }
            ners = Generics.NewHashMap();
            foreach (string classifier_2 in classifiers)
            {
                CRFClassifier model    = null;
                string        filename = "/WEB-INF/data/models/" + classifier_2;
                InputStream   @is      = GetServletConfig().GetServletContext().GetResourceAsStream(filename);
                if (@is == null)
                {
                    throw new ServletException("File not found. Filename = " + filename);
                }
                try
                {
                    if (filename.EndsWith(".gz"))
                    {
                        @is = new BufferedInputStream(new GZIPInputStream(@is));
                    }
                    else
                    {
                        @is = new BufferedInputStream(@is);
                    }
                    model = CRFClassifier.GetClassifier(@is);
                }
                catch (IOException)
                {
                    throw new ServletException("IO problem reading classifier.");
                }
                catch (InvalidCastException)
                {
                    throw new ServletException("Classifier class casting problem.");
                }
                catch (TypeLoadException)
                {
                    throw new ServletException("Classifier class not found problem.");
                }
                finally
                {
                    IOUtils.CloseIgnoringExceptions(@is);
                }
                ners[classifier_2] = model;
            }
        }
Code Example #25
            private void PrimeNext()
            {
                if (this._enclosing.inputReader == null)
                {
                    // we've already been out of stuff and have closed the input reader; so just return
                    return;
                }
                this.nextSent = Generics.NewArrayList(this.nextSentCarryover);
                this.nextSentCarryover.Clear();
                bool seenBoundary = false;

                if (!this.tokenizer.MoveNext())
                {
                    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
                    this._enclosing.inputReader = null;
                    // nextSent = null; // WRONG: There may be something in it from the nextSentCarryover
                    if (this.nextSent.IsEmpty())
                    {
                        this.nextSent = null;
                    }
                    return;
                }
                do
                {
                    IHasWord token = this.tokenizer.Current;
                    if (this.splitTag != null)
                    {
                        string[] toks = this.splitTag.Apply(token.Word());
                        token.SetWord(toks[0]);
                        if (token is ILabel)
                        {
                            ((ILabel)token).SetValue(toks[0]);
                        }
                        if (toks.Length == 2 && token is IHasTag)
                        {
                            //wsg2011: Some of the underlying tokenizers return old
                            //JavaNLP labels.  We could convert to CoreLabel here, but
                            //we choose a conservative implementation....
                            ((IHasTag)token).SetTag(toks[1]);
                        }
                    }
                    if (this.sentDelims.Contains(token.Word()))
                    {
                        seenBoundary = true;
                    }
                    else
                    {
                        if (seenBoundary && !this.delimFollowers.Contains(token.Word()))
                        {
                            this.nextSentCarryover.Add(token);
                            break;
                        }
                    }
                    if (!(DocumentPreprocessor.wsPattern.Matcher(token.Word()).Matches() || token.Word().Equals(PTBTokenizer.GetNewlineToken())))
                    {
                        this.nextSent.Add(token);
                    }
                    // If there are no words that can follow a sentence delimiter,
                    // then there are two cases.  In one case is we already have a
                    // sentence, in which case there is no reason to look at the
                    // next token, since that just causes buffering without any
                    // chance of the current sentence being extended, since
                    // delimFollowers = {}.  In the other case, we have an empty
                    // sentence, which at this point means the sentence delimiter
                    // was a whitespace token such as \n.  We might as well keep
                    // going as if we had never seen anything.
                    if (seenBoundary && this.delimFollowers.IsEmpty())
                    {
                        if (!this.nextSent.IsEmpty() || this._enclosing.keepEmptySentences)
                        {
                            break;
                        }
                        else
                        {
                            seenBoundary = false;
                        }
                    }
                }while (this.tokenizer.MoveNext());
                if (this.nextSent.IsEmpty() && this.nextSentCarryover.IsEmpty() && !this._enclosing.keepEmptySentences)
                {
                    IOUtils.CloseIgnoringExceptions(this._enclosing.inputReader);
                    this._enclosing.inputReader = null;
                    this.nextSent = null;
                }
                else
                {
                    if (this._enclosing.escaper != null)
                    {
                        this.nextSent = this._enclosing.escaper.Apply(this.nextSent);
                    }
                }
            }
Code Example #26
        /// <summary>Load a collection of parse trees from the file of given name.</summary>
        /// <remarks>
        /// Load a collection of parse trees from the file of given name.
        /// Each tree may optionally be encased in parens to allow for Penn
        /// Treebank style trees.
        /// This method implements the <code>FileProcessor</code> interface.
        /// </remarks>
        /// <param name="file">file to load a tree from</param>
        public void ProcessFile(File file)
        {
            ITreeReader tr = null;
            // SRL stuff
            CollectionValuedMap <int, string> srlMap = null;

            if (this.srlMap != null)
            {
                // there must be a better way ...
                string filename = file.GetAbsolutePath();
                foreach (string suffix in this.srlMap.Keys)
                {
                    if (filename.EndsWith(suffix))
                    {
                        srlMap = this.srlMap[suffix];
                        break;
                    }
                }
                if (srlMap == null)
                {
                    log.Info("could not find SRL entries for file: " + file);
                }
            }
            try
            {
                // maybe print file name to stdout to get some feedback
                // could throw an IO exception if can't open for reading
                tr = TreeReaderFactory().NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), Encoding())));
                int  sentIndex = 0;
                Tree pt;
                while ((pt = tr.ReadTree()) != null)
                {
                    if (pt.Label() is IHasIndex)
                    {
                        // so we can trace where this tree came from
                        IHasIndex hi = (IHasIndex)pt.Label();
                        hi.SetDocID(file.GetName());
                        hi.SetSentIndex(sentIndex);
                    }
                    if (srlMap == null)
                    {
                        parseTrees.Add(pt);
                    }
                    else
                    {
                        ICollection <string> srls = srlMap[sentIndex];
                        //           pt.pennPrint();
                        //           log.info(srls);
                        parseTrees.Add(pt);
                        if (srls.IsEmpty())
                        {
                        }
                        else
                        {
                            //            parseTrees.add(pt);
                            foreach (string srl in srls)
                            {
                                //              Tree t = pt.deepCopy();
                                string[] bits      = srl.Split("\\s+");
                                int      verbIndex = System.Convert.ToInt32(bits[0]);
                                string   lemma     = bits[2].Split("\\.")[0];
                                //              Tree verb = Trees.getTerminal(t, verbIndex);
                                Tree verb = Edu.Stanford.Nlp.Trees.Trees.GetTerminal(pt, verbIndex);
                                //              ((CoreLabel)verb.label()).set(SRLIDAnnotation.class, SRL_ID.REL);
                                ((CoreLabel)verb.Label()).Set(typeof(CoreAnnotations.CoNLLPredicateAnnotation), true);
                                for (int i = 4; i < bits.Length; i++)
                                {
                                    string   arg = bits[i];
                                    string[] bits1;
                                    if (arg.IndexOf("ARGM") >= 0)
                                    {
                                        bits1 = arg.Split("-");
                                    }
                                    else
                                    {
                                        bits1 = arg.Split("-");
                                    }
                                    string locs    = bits1[0];
                                    string argType = bits1[1];
                                    if (argType.Equals("rel"))
                                    {
                                        continue;
                                    }
                                    foreach (string loc in locs.Split("[*,]"))
                                    {
                                        bits1 = loc.Split(":");
                                        int term   = System.Convert.ToInt32(bits1[0]);
                                        int height = System.Convert.ToInt32(bits1[1]);
                                        //                  Tree t1 = Trees.getPreTerminal(t, term);
                                        Tree t1 = Edu.Stanford.Nlp.Trees.Trees.GetPreTerminal(pt, term);
                                        for (int j = 0; j < height; j++)
                                        {
                                            //                    t1 = t1.parent(t);
                                            t1 = t1.Parent(pt);
                                        }
                                        IDictionary <int, string> roleMap = ((CoreLabel)t1.Label()).Get(typeof(CoreAnnotations.CoNLLSRLAnnotation));
                                        if (roleMap == null)
                                        {
                                            roleMap = Generics.NewHashMap();
                                            ((CoreLabel)t1.Label()).Set(typeof(CoreAnnotations.CoNLLSRLAnnotation), roleMap);
                                        }
                                        roleMap[verbIndex] = argType;
                                    }
                                }
                            }
                        }
                    }
                    //                  ((CoreLabel)t1.label()).set(SRLIDAnnotation.class, SRL_ID.ARG);
                    //               for (Tree t1 : t) {
                    //                 if (t1.isLeaf()) { continue; }
                    //                 CoreLabel fl = (CoreLabel)t1.label();
                    //                 if (fl.value() == null) { continue; }
                    //                 if (!fl.has(SRLIDAnnotation.class)) {
                    //                   boolean allNone = true;
                    //                   for (Tree t2 : t1) {
                    //                     SRL_ID s = ((CoreLabel)t2.label()).get(SRLIDAnnotation.class);
                    //                     if (s == SRL_ID.ARG || s == SRL_ID.REL) {
                    //                       allNone = false;
                    //                       break;
                    //                     }
                    //                   }
                    //                   if (allNone) {
                    //                     fl.set(SRLIDAnnotation.class, SRL_ID.ALL_NO);
                    //                   } else {
                    //                     fl.set(SRLIDAnnotation.class, SRL_ID.NO);
                    //                   }
                    //                 }
                    //               }
                    //              parseTrees.add(t);
                    sentIndex++;
                }
            }
            catch (IOException e)
            {
                throw new RuntimeIOException("MemoryTreebank.processFile IOException in file " + file, e);
            }
            finally
            {
                IOUtils.CloseIgnoringExceptions(tr);
            }
        }
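Reading the SRL branch backwards, each entry in srlMap is a whitespace-separated record whose fields the loop consumes as follows; the field labels are mine, inferred from the indices used, not taken from the source:

// bits[0]    verb terminal index within the tree
// bits[2]    predicate as "lemma.sense" (the lemma is extracted but then unused here)
// bits[4..]  arguments of the form "locs-ARGTYPE", where locs is one or more
//            "term:height" pairs separated by '*' or ','. For each pair the code
//            takes the preterminal at index term, climbs height parents up the
//            tree, and records roleMap[verbIndex] = ARGTYPE on that node.
// Arguments typed "rel" are skipped; bits[1] and bits[3] are never read.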