Exemple #1
        /// <summary>
        /// Builds a CoreNLP annotation from the raw text of <paramref name="doc"/>, reads the
        /// sentence list stored on it, and wraps each sentence's text in a new <c>Text</c> object.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is read.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces and stops inside the loop. The return
        /// statement is not visible, so the content of the returned string cannot be stated from here.
        /// </remarks>
        public string SplitSentences(Models.Text doc)
            // Presumably collects the per-sentence objects; nothing is added to it in the visible lines.
            List <Models.Text> toReturn = new List <Models.Text>();
            string             fulltext = doc.RawText;

            // NOTE(review): no pipeline annotate call on `document` is visible in this excerpt;
            // presumably it was elided. Confirm the sentence annotation is populated before the get below.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            // Convert the java.util.List stored under SentencesAnnotation into a .NET List<CoreMap>.
            List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            foreach (CoreMap sentence in sentences)
                // One Text object per sentence, carrying the text stored under TextAnnotation.
                Models.Text sentenceObject = new Text();
                sentenceObject.RawText = (string)sentence.get(typeof(TextAnnotation));
Exemple #2
        /// <summary>
        /// Builds a CoreNLP annotation from the raw text of <paramref name="doc"/>, reads the
        /// entity mentions stored on it, and copies each mention's character offsets and
        /// named-entity tag into a <c>Bean.Annotation</c>.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is read.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces and stops inside the loop. Neither the
        /// code that stores each annotation nor the return statement is visible.
        /// </remarks>
        public string SuggestEntityMentions(Models.Text doc)
            string fulltext = doc.RawText;

            // NOTE(review): no pipeline annotate call on `document` is visible in this excerpt;
            // presumably it was elided. Confirm the mentions annotation is populated before the get below.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            // Convert the java.util.List stored under MentionsAnnotation into a .NET List<CoreMap>.
            List <CoreMap>         entityMentions = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(MentionsAnnotation)));
            // Presumably the result collection; nothing is added to it in the visible lines.
            List <Bean.Annotation> annotations    = new List <Bean.Annotation>();

            foreach (CoreMap entityMention in entityMentions)
                Bean.Annotation annotation = new Bean.Annotation();
                // Offsets are stored as java.lang.Integer, so unbox them with intValue().
                annotation.begin = ((Integer)entityMention.get(typeof(CharacterOffsetBeginAnnotation))).intValue();
                annotation.end   = ((Integer)entityMention.get(typeof(CharacterOffsetEndAnnotation))).intValue();
                // The mention's named-entity tag becomes the annotation type.
                annotation.type  = (string)entityMention.get(typeof(NamedEntityTagAnnotation));
Exemple #3
        /// <summary>
        /// Rebuilds the raw text of <paramref name="doc"/> token by token, looking each token up in
        /// two <c>Distance</c> models and substituting the first hit from the spelling model when the
        /// main model returns no results for the token.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is read.</param>
        /// <remarks>
        /// NOTE(review): this excerpt contains no braces, so the nesting of the <c>if</c> bodies is
        /// inferred from indentation and comments only. The return statement is not visible;
        /// presumably the method returns <c>correctedText</c> - confirm against the full source.
        /// </remarks>
        public string SpellCorrect(Models.Text doc)
            string fulltext = doc.RawText;

            // These next two lines really should not be done per call.  They should be moved to startup
            // Both model files are resolved relative to the application's base directory.
            var distance         = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
            var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

            // Here, we manipulate fulltext if there are spelling errors present
            // then we return the edited text

            // Accumulates the rebuilt document, one token at a time.
            string correctedText = "";

            // fetch tokenization for the document as we are correcting individual words
            // NOTE(review): no pipeline annotate call on `document` is visible in this excerpt;
            // presumably it was elided. Confirm the sentence/token annotations are populated.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            foreach (CoreMap sentence in sentences)
                // The list is typed as CoreMap; the foreach casts each element to CoreLabel.
                foreach (CoreLabel token in JavaExtensions.ToList <CoreMap>((java.util.List)sentence.get(typeof(TokensAnnotation))))
                    // we have to look this token up in both normal word space as well as spelling word space
                    // at that point, we would do the mathematics to compute the resultant word vector

                    /* Intended approach (the vector arithmetic is not implemented in the visible lines).
                     * You have something like:
                     * [reliable] - [relieable] + [foriegn] ==> [foreign]
                     * To generalise this approach (make it less reliant on "reliable"),
                     * we can build a spelling transformation vector by taking the average
                     * difference between a set of pairs of correct and incorrectly spelled words.
                     * We can then fix a spelling mistake by subtracting this spelling transformation
                     * vector from the incorrectly spelled word vector and finding the word closest
                     * to where we end up. */

                    BestWord[] bestwords         = distance.Search(token.word());
                    BestWord[] spellingBestwords = spellingDistance.Search(token.word());

                    // No results from the main model for this token.
                    if (bestwords.Length == 0)
                        // Default to the token unchanged.
                        string correction = token.word();

                        // we assume there might be a spelling mistake
                        if (spellingBestwords.Length != 0)
                            // Take the first result from the spelling model as the replacement.
                            correction = spellingBestwords[0].Word;

                        // We have to make a proper decision on the next line
                        // Separate tokens with a single space; no separator before the first token.
                        if (correctedText.Length > 0)
                            correctedText += " ";
                        correctedText = correctedText + correction;
                        // NOTE(review): the comment below reads like the start of an else branch (the main
                        // model returned results), but no `else` keyword is present in this excerpt - confirm.
                        // we assume that this is spelled right since our main vector knows of it

                        // this is really not the correct way to construct the document because space is not
                        // always the appropriate whitespace.
                        if (correctedText.Length > 0)
                            correctedText += " ";
                        correctedText = correctedText + token.word();
