예제 #1
0
        /// <summary>
        /// Runs a CoreNLP pipeline (annotators chosen by <paramref name="option"/>)
        /// over <paramref name="text"/> and prints token/POS/NER triples plus the
        /// parse tree for every sentence to the console.
        /// </summary>
        /// <param name="option">Comma-separated annotator list passed to the pipeline.</param>
        /// <param name="text">Raw text to annotate.</param>
        /// <param name="disableLogging">When true, silences Redwood's console logging.</param>
        public static void Execute(string option, string text, bool disableLogging = true)
        {
            if (disableLogging)
            {
                RedwoodConfiguration.current().clear().apply();
            }

            var modelsRoot    = @"../../../data/paket-files/stanford-corenlp-3.9.1-models/";
            var pipelineProps = new Properties();
            pipelineProps.setProperty("annotators", option);
            pipelineProps.setProperty("ner.useSUTime", "0");

            // CoreNLP resolves its model files relative to the working directory,
            // so temporarily switch into the models folder while building the pipeline.
            var previousDir = Environment.CurrentDirectory;
            Directory.SetCurrentDirectory(modelsRoot);
            var corePipeline = new StanfordNLP.StanfordCoreNLP(pipelineProps);
            Directory.SetCurrentDirectory(previousDir);

            // Annotate the input text.
            var document = new StanfordNLP.Annotation(text);
            corePipeline.annotate(document);

            // Class keys used to pull each annotation layer out of the result.
            var sentenceKey = new CoreAnnotations.SentencesAnnotation().getClass();
            var tokenKey    = new CoreAnnotations.TokensAnnotation().getClass();
            var posKey      = new CoreAnnotations.PartOfSpeechAnnotation().getClass();
            var nerKey      = new CoreAnnotations.NamedEntityTagAnnotation().getClass();
            var parseKey    = new TreeCoreAnnotations.TreeAnnotation().getClass();

            var sentenceList = document.get(sentenceKey) as ArrayList;

            foreach (CoreMap sentence in sentenceList.toArray())
            {
                var sentenceTokens = (ArrayList)sentence.get(tokenKey);

                Console.WriteLine("Token-POS-NER: ");
                foreach (CoreLabel token in sentenceTokens)
                {
                    Console.Write($"{token.value()}-{token.get(posKey)}-{token.get(nerKey)} ");
                }
                Console.WriteLine("\n\n\n");

                // Print the constituency parse (plus collapsed typed dependencies)
                // when the parse annotator produced one.
                var parseTree = (Tree)sentence.get(parseKey);
                if (parseTree != null)
                {
                    Console.WriteLine("Parsed Text: ");
                    new TreePrint("penn,typedDependenciesCollapsed").printTree(parseTree);
                }
            }
        }
        /// <summary>
        /// Runs the CoreNLP pipeline over a raw phrase and wraps the result.
        /// </summary>
        /// <param name="phrase">Raw text to annotate.</param>
        /// <returns>
        /// A facade over the annotated document, or null when the pipeline
        /// has not been initialized.
        /// </returns>
        public StanfordDocumentFacade Annotate(string phrase)
        {
            if (!this.Initialized)
            {
                return null;
            }

            // FIX: the original wrapped this in try/catch only to "throw ex",
            // which resets the stack trace (CA2200). The useless wrapper has
            // been removed so exceptions propagate with full diagnostics.
            var annotation = new edu.stanford.nlp.pipeline.Annotation(phrase);
            _pipeLine.annotate(annotation);
            return new StanfordDocumentFacade(annotation, phrase);
        }
예제 #3
0
        /// <summary>
        /// Annotates <paramref name="text"/> and returns the predicted sentiment
        /// class (0 = Very Negative ... 4 = Very Positive) of the longest sentence.
        /// </summary>
        /// <param name="text">Raw text to analyze.</param>
        /// <returns>The sentiment class of the longest sentence, or -1 when no sentences were produced.</returns>
        public int SentiAnalysis(string text)
        {
            // Annotation
            var annotation = new edu.stanford.nlp.pipeline.Annotation(text);
            pipeline.annotate(annotation);

            // FIXES vs. original:
            //  - removed the pretty-print into a discarded ByteArrayOutputStream,
            //    the empty try/catch, and the unused sentimentText/NF locals;
            //  - the original "return sentiment;" sat unconditionally inside the
            //    loop, so only the FIRST sentence was ever evaluated and the
            //    longest-sentence tracking below was dead code. The return now
            //    happens after the loop, as the tracking logic clearly intended.
            var sentences = annotation.get(new CoreAnnotations.SentencesAnnotation().getClass()) as ArrayList;
            if (sentences == null)
            {
                return -1;
            }

            int  mainSentiment = 0;
            int  longest       = 0;
            bool anySentence   = false;

            foreach (CoreMap sentence in sentences)
            {
                Tree tree = (Tree)sentence.get(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree));

                int    sentiment = edu.stanford.nlp.neural.rnn.RNNCoreAnnotations.getPredictedClass(tree);
                string partText  = sentence.ToString();

                // Keep the sentiment of the longest sentence seen so far.
                if (partText.Length > longest)
                {
                    mainSentiment = sentiment;
                    longest       = partText.Length;
                    anySentence   = true;
                }
            }

            return anySentence ? mainSentiment : -1;
        }
        /// <summary>
        /// Accepts a document (text) file, lets CoreNLP process (annotate) it
        /// and returns an adapter that allows iterating the output.
        ///
        /// The file contents are normalized (via NormalizeFile) before annotation,
        /// and the normalized text is carried on the returned facade.
        /// </summary>
        /// <param name="file">Text file to annotate.</param>
        /// <returns>returns null if the pipeline has not been initialized</returns>
        public StanfordDocumentFacade Annotate(FileInfo file)
        {
            if (!this.Initialized)
            {
                return null;
            }

            // FIX: the original caught Exception only to "throw ex", which
            // destroys the stack trace (CA2200). Letting exceptions propagate
            // unchanged preserves the full diagnostic information.
            string data;
            NormalizeFile(file, out data);
            var annotation = new edu.stanford.nlp.pipeline.Annotation(data);
            _pipeLine.annotate(annotation);
            return new StanfordDocumentFacade(annotation, data);
        }
예제 #5
0
        /// <summary>
        /// Splits the raw text of a document into sentences and returns them as
        /// a JSON-serialized list of Text objects (one per sentence).
        /// </summary>
        /// <param name="doc">Document whose RawText is to be split.</param>
        /// <returns>JSON array of sentence Text objects.</returns>
        public string SplitSentences(Models.Text doc)
        {
            string rawText = doc.RawText;

            // Run a fresh pipeline over the full document text.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(rawText);
            PipelineDispenser.GetNewPipeline().annotate(document);

            List<CoreMap> sentenceMaps = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            // Wrap each sentence's text in its own Text object.
            List<Models.Text> result = new List<Models.Text>();
            foreach (CoreMap sentenceMap in sentenceMaps)
            {
                Text sentence = new Text();
                sentence.RawText = (string)sentenceMap.get(typeof(TextAnnotation));
                result.Add(sentence);
            }

            return JsonConvert.SerializeObject(result);
        }
예제 #6
0
        /// <summary>
        /// Runs entity-mention detection over the document text and returns the
        /// detected mentions (character offsets plus entity type) as JSON.
        /// </summary>
        /// <param name="doc">Document whose RawText is to be scanned.</param>
        /// <returns>JSON array of mention annotations.</returns>
        public string SuggestEntityMentions(Models.Text doc)
        {
            string rawText = doc.RawText;

            // Annotate the full text with a fresh pipeline.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(rawText);
            PipelineDispenser.GetNewPipeline().annotate(document);

            List<CoreMap> mentions = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(MentionsAnnotation)));

            // Project each CoreNLP mention into our lightweight annotation bean.
            List<Bean.Annotation> results = new List<Bean.Annotation>();
            foreach (CoreMap mention in mentions)
            {
                Bean.Annotation item = new Bean.Annotation();
                item.begin = ((Integer)mention.get(typeof(CharacterOffsetBeginAnnotation))).intValue();
                item.end   = ((Integer)mention.get(typeof(CharacterOffsetEndAnnotation))).intValue();
                item.type  = (string)mention.get(typeof(NamedEntityTagAnnotation));
                results.Add(item);
            }

            return JsonConvert.SerializeObject(results);
        }
예제 #7
0
        /// <summary>
        /// Annotates a user query with the supplied pipeline and returns
        /// CoreNLP's pretty-printed analysis as a string.
        /// </summary>
        /// <param name="sent">User query text to process.</param>
        /// <param name="pipeline1">Pre-built CoreNLP pipeline to run.</param>
        /// <returns>The pretty-printed annotation output.</returns>
        public static String Parse_Data(String sent, StanfordCoreNLP pipeline1)
        {// extract meaningful words from user query
            // Annotation
            var annotation = new edu.stanford.nlp.pipeline.Annotation(sent);
            pipeline1.annotate(annotation);

            // Capture the pretty-printed result via an in-memory Java stream.
            // FIX: the explicit stream.close() was removed — the using block
            // already disposes the stream, and ByteArrayOutputStream.close()
            // is documented as a no-op anyway.
            string output;
            using (var stream = new ByteArrayOutputStream())
            {
                pipeline1.prettyPrint(annotation, new PrintWriter(stream));
                System.Console.WriteLine(" it's stanford time ");
                output = stream.toString();
            }
            return output;
        }
        /// <summary>
        /// Runs sentence-level sentiment analysis over every string in the input
        /// payload and emits one 10-column output row per sentence (or one blank
        /// row for empty/whitespace inputs, keeping output aligned with input).
        /// </summary>
        /// <param name="Input">Payload carrying the strings and segment bookkeeping to analyze.</param>
        /// <returns>A payload with per-sentence sentiment rows.</returns>
        public Payload RunPlugin(Payload Input)
        {
            Payload pData = new Payload();

            pData.FileID = Input.FileID;
            bool trackSegmentID = false;

            if (Input.SegmentID.Count > 0)
            {
                trackSegmentID = true;
            }
            else
            {
                pData.SegmentID = Input.SegmentID;
            }

            for (int i = 0; i < Input.StringList.Count; i++)
            {
                //seems to prematurely exit sometimes. checking to see what might cause that -- maybe blank docs?
                if (!string.IsNullOrEmpty(Input.StringList[i]) && !string.IsNullOrWhiteSpace(Input.StringList[i]))
                {
                    // FIX: the original pre-initialized 'annotation' and
                    // 'sentences' with throwaway "new Annotation()" /
                    // "new ArrayList()" objects that were immediately
                    // overwritten; those dead allocations are removed.
                    Annotation annotation = new edu.stanford.nlp.pipeline.Annotation(Input.StringList[i]);
                    pipeline.annotate(annotation);
                    ArrayList sentences = annotation.get(new CoreAnnotations.SentencesAnnotation().getClass()) as ArrayList;

                    List <double> SentimentValues = new List <double>();
                    int SentenceCount = 0;

                    foreach (CoreMap sentence in sentences)
                    {
                        SentenceCount++;
                        Tree tree = sentence.get(new SentimentCoreAnnotations.SentimentAnnotatedTree().getClass()) as Tree;

                        //add this sentence to our overall list of sentiment scores
                        SentimentValues.Add(RNNCoreAnnotations.getPredictedClass(tree));

                        string[] OutputString_SentenceLevel = new string[10] {
                            "", "", "", "", "", "", "", "", "", ""
                        };

                        string Classification = GetClassification((double)RNNCoreAnnotations.getPredictedClass(tree));

                        //this pulls out the prediction probabilites for each class
                        string   Predictions       = RNNCoreAnnotations.getPredictionsAsStringList(tree).ToString();
                        string[] Predictions_Split = Predictions.Replace("[", "").Replace("]", "").Split(',');

                        if (useBuiltInSentenceSplitter)
                        {
                            OutputString_SentenceLevel[0] = SentenceCount.ToString();
                        }
                        else
                        {
                            //if we're using an external sentence tokenizer, then every segment is
                            //going to be treated as its own sentence.
                            OutputString_SentenceLevel[0] = (i + 1).ToString();
                        }

                        OutputString_SentenceLevel[1] = Classification;
                        OutputString_SentenceLevel[2] = RNNCoreAnnotations.getPredictedClassProb(tree.label()).ToString();
                        OutputString_SentenceLevel[3] = RNNCoreAnnotations.getPredictedClass(tree).ToString();
                        OutputString_SentenceLevel[4] = Predictions_Split[0];
                        OutputString_SentenceLevel[5] = Predictions_Split[1];
                        OutputString_SentenceLevel[6] = Predictions_Split[2];
                        OutputString_SentenceLevel[7] = Predictions_Split[3];
                        OutputString_SentenceLevel[8] = Predictions_Split[4];
                        if (includeSentenceText)
                        {
                            OutputString_SentenceLevel[9] = sentence.ToString();
                        }

                        pData.StringArrayList.Add(OutputString_SentenceLevel);
                        pData.SegmentNumber.Add(Input.SegmentNumber[i]);
                        if (trackSegmentID)
                        {
                            pData.SegmentID.Add(Input.SegmentID[i]);
                        }
                    }
                }
                else
                {
                    // Blank/whitespace input: emit an empty row so output rows
                    // stay aligned with input segments.
                    pData.StringArrayList.Add(new string[10] {
                        "", "", "", "", "", "", "", "", "", ""
                    });
                    pData.SegmentNumber.Add(Input.SegmentNumber[i]);
                    if (trackSegmentID)
                    {
                        pData.SegmentID.Add(Input.SegmentID[i]);
                    }
                }
            }

            return pData;
        }
예제 #9
0
        /// <summary>
        /// Tokenizes the document with CoreNLP and rebuilds its text word by
        /// word, substituting the spelling model's closest match for any token
        /// the main word-vector model does not recognize. Tokens are rejoined
        /// with single spaces.
        /// </summary>
        /// <param name="doc">Document whose RawText is to be spell-corrected.</param>
        /// <returns>The reconstructed, space-joined text.</returns>
        public string SpellCorrect(Models.Text doc)
        {
            string fulltext = doc.RawText;

            // These next two lines really should not be done per call.  They should be moved to startup
            var distance         = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
            var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

            string correctedText = "";

            // Fetch the tokenization for the document; corrections are per word.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            PipelineDispenser.GetNewPipeline().annotate(document);
            List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            foreach (CoreMap sentence in sentences)
            {
                foreach (CoreLabel token in JavaExtensions.ToList <CoreMap>((java.util.List)sentence.get(typeof(TokensAnnotation))))
                {
                    /* Spelling-correction idea: build a spelling transformation
                     * vector from pairs of correct/incorrect spellings, e.g.
                     *
                     *   [reliable] - [relieable] + [foriegn] ==> [foreign]
                     *
                     * Subtracting that average transformation vector from a
                     * misspelled word's vector and taking the nearest word
                     * yields the likely correction. */

                    string word = token.word();

                    // Look the token up in both the normal word space and the
                    // spelling word space.
                    BestWord[] knownMatches    = distance.Search(word);
                    BestWord[] spellingMatches = spellingDistance.Search(word);

                    // If the main vector space knows the word, keep it as-is;
                    // otherwise assume a spelling mistake and fall back to the
                    // spelling model's best suggestion when one exists.
                    string emitted = word;
                    if (knownMatches.Length == 0 && spellingMatches.Length != 0)
                    {
                        emitted = spellingMatches[0].Word;
                    }

                    // NOTE(review): a bare space is not always the appropriate
                    // separator when reconstructing the document.
                    if (correctedText.Length > 0)
                    {
                        correctedText += " ";
                    }
                    correctedText += emitted;
                }
            }

            return correctedText;
        }
예제 #10
0
        /// <summary>
        /// Background worker: loads the CoreNLP sentiment pipeline, analyzes
        /// every *.txt file in the selected folder, and writes two CSV outputs —
        /// one row per file (mean/SD sentiment) and one row per sentence.
        /// </summary>
        /// <param name="sender">Background worker raising the event.</param>
        /// <param name="e">Carries the DictionaryData argument with folder/output settings.</param>
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            DictionaryData DictData = (DictionaryData)e.Argument;

            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading CoreNLP models... please wait...";
            });

            //largely taken from here: https://github.com/sergey-tihon/Stanford.NLP.NET/issues/39
            var jarRoot = @"stanford-corenlp-full-2018-02-27\";
            var props   = new java.util.Properties();

            props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
            props.setProperty("sutime.binders", "0");

            // CoreNLP loads its model files relative to the working directory,
            // so switch into the model folder while the pipeline is built...
            var curDir = Environment.CurrentDirectory;

            Directory.SetCurrentDirectory(Path.Combine(Path.GetDirectoryName(AppDomain.CurrentDomain.BaseDirectory), jarRoot));
            var pipeline = new StanfordCoreNLP(props);

            // ...and restore it afterwards. FIX: curDir was captured but never
            // used, leaving the process in the model folder for the rest of the
            // run (the Execute example restores it; this handler now does too).
            Directory.SetCurrentDirectory(curDir);

            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });

            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(DictData.TextFileFolder, "*.txt", SearchDepth);

            //try
            //{

            //open up the output file
            using (StreamWriter outputFile = new StreamWriter(new FileStream(DictData.OutputFileLocation, FileMode.Create), SelectedEncoding))
            {
                using (StreamWriter outputFileSentences = new StreamWriter(new FileStream(AddSuffix(DictData.OutputFileLocation, "_Sentences"), FileMode.Create), SelectedEncoding))
                {
                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append("\"Filename\",\"Sentences\",\"Classification\",\"Classification_M\",\"Classification_SD\"");

                    outputFile.WriteLine(HeaderString.ToString());

                    StringBuilder HeaderStringSentence = new StringBuilder();
                    HeaderStringSentence.Append("\"Filename\",\"SentNumber\",\"SentenceText\",\"Classification\",\"Class_Prob\",\"Class_Number\"");
                    outputFileSentences.WriteLine(HeaderStringSentence.ToString());

                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);
                        Dictionary <string, int> DictionaryResults = new Dictionary <string, int>();

                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });

                        //read in the text file, convert everything to lowercase
                        string InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();

                        // -- analyze the text --
                        var annotation = new edu.stanford.nlp.pipeline.Annotation(InputText);
                        pipeline.annotate(annotation);

                        List <double> SentimentValues = new List <double>();

                        var sentences = annotation.get(new CoreAnnotations.SentencesAnnotation().getClass()) as ArrayList;

                        int SentenceCount = 0;

                        foreach (CoreMap sentence in sentences)
                        {
                            SentenceCount++;
                            Tree tree = sentence.get(new SentimentCoreAnnotations.SentimentAnnotatedTree().getClass()) as Tree;

                            //add this sentence to our overall list of sentiment scores
                            SentimentValues.Add(RNNCoreAnnotations.getPredictedClass(tree));

                            // -- write per-sentence output row --
                            string[] OutputString_SentenceLevel = new string[6];

                            string Classification = GetClassification((double)RNNCoreAnnotations.getPredictedClass(tree));

                            OutputString_SentenceLevel[0] = "\"" + Filename_Clean + "\"";
                            OutputString_SentenceLevel[1] = SentenceCount.ToString();
                            // embedded quotes are doubled for CSV escaping
                            OutputString_SentenceLevel[2] = "\"" + sentence.ToString().Replace("\"", "\"\"") + "\"";
                            OutputString_SentenceLevel[3] = Classification;
                            OutputString_SentenceLevel[4] = RNNCoreAnnotations.getPredictedClassProb(tree.label()).ToString();
                            OutputString_SentenceLevel[5] = RNNCoreAnnotations.getPredictedClass(tree).ToString();

                            outputFileSentences.WriteLine(String.Join(",", OutputString_SentenceLevel));
                        }

                        //write output at the file level
                        string[] OutputString = new string[5];
                        OutputString[0] = "\"" + Filename_Clean + "\"";
                        OutputString[1] = SentenceCount.ToString();
                        OutputString[2] = GetClassification(SentimentValues.Average());
                        OutputString[3] = SentimentValues.Average().ToString();
                        OutputString[4] = StandardDeviation(SentimentValues).ToString();

                        outputFile.WriteLine(String.Join(",", OutputString));
                    }

                    //this is the closing bracket for the sentence-level "using" filestream
                }

                //this is the closing bracket for the document-level "using" filestream
            }

            //}
            //catch
            //{
            //    MessageBox.Show("Senti-Gent encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while Senti-Gent is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            //}
        }