Пример #1
0
        /// <summary>
        /// Runs the NLP pipeline over the document's raw text and returns one
        /// <c>Models.Text</c> entry per detected sentence, serialized as JSON.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is split.</param>
        /// <returns>JSON array of text objects, one per sentence, in pipeline order.</returns>
        public string SplitSentences(Models.Text doc)
        {
            var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
            PipelineDispenser.GetNewPipeline().annotate(annotated);

            // The pipeline stores sentences as a java.util.List; convert it to a .NET list first.
            var javaSentences = (java.util.List)annotated.get(typeof(SentencesAnnotation));
            List <CoreMap> sentenceMaps = JavaExtensions.ToList <CoreMap>(javaSentences);

            // Wrap each sentence's text in its own Text object.
            List <Models.Text> result = sentenceMaps.ConvertAll <Models.Text>(
                sentenceMap => new Text { RawText = (string)sentenceMap.get(typeof(TextAnnotation)) });

            return JsonConvert.SerializeObject(result);
        }
Пример #2
0
        /// <summary>
        /// Runs the NLP pipeline over the document's raw text and returns the entity mentions
        /// it found as a JSON array of annotations (begin offset, end offset, entity type).
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is analysed.</param>
        /// <returns>JSON array with one annotation per entity mention.</returns>
        public string SuggestEntityMentions(Models.Text doc)
        {
            var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
            PipelineDispenser.GetNewPipeline().annotate(annotated);

            var javaMentions = (java.util.List)annotated.get(typeof(MentionsAnnotation));
            List <CoreMap> mentions = JavaExtensions.ToList <CoreMap>(javaMentions);

            // Copy the character offsets and the entity tag of every mention into a bean.
            List <Bean.Annotation> suggested = mentions.ConvertAll(mention => new Bean.Annotation
            {
                begin = ((Integer)mention.get(typeof(CharacterOffsetBeginAnnotation))).intValue(),
                end   = ((Integer)mention.get(typeof(CharacterOffsetEndAnnotation))).intValue(),
                type  = (string)mention.get(typeof(NamedEntityTagAnnotation))
            });

            return JsonConvert.SerializeObject(suggested);
        }
Пример #3
0
        /// <summary>
        /// Tokenizes the document's raw text and rebuilds it token by token. A token for which
        /// the main word-vector model returns no matches is replaced by the first match from the
        /// spelling model (when there is one); every other token is kept as-is.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is corrected.</param>
        /// <returns>The resulting tokens joined by single spaces.</returns>
        public string SpellCorrect(Models.Text doc)
        {
            string fulltext = doc.RawText;

            // These next two lines really should not be done per call.  They should be moved to startup
            var distance         = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
            var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

            // fetch tokenization for the document as we are correcting individual words
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            PipelineDispenser.GetNewPipeline().annotate(document);
            List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            // StringBuilder avoids re-copying the whole output for every appended token.
            var correctedText = new System.Text.StringBuilder();

            foreach (CoreMap sentence in sentences)
            {
                List <CoreLabel> tokens = JavaExtensions.ToList <CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
                foreach (CoreLabel token in tokens)
                {
                    /* Planned approach (not implemented here): something like
                     *
                     * [reliable] - [relieable] + [foriegn] ==> [foreign]
                     *
                     * To generalise it, build a spelling transformation vector from the average
                     * difference between pairs of correctly and incorrectly spelled words, subtract
                     * it from the misspelled word's vector and take the closest word.
                     * For now the code simply takes the spelling model's first match. */
                    string word       = token.word();
                    string correction = word;

                    // A word the main model knows is assumed to be spelled correctly, so the
                    // spelling model is only consulted for words the main model has no match for.
                    if (distance.Search(word).Length == 0)
                    {
                        BestWord[] spellingBestwords = spellingDistance.Search(word);
                        if (spellingBestwords.Length != 0)
                        {
                            correction = spellingBestwords[0].Word;
                        }
                    }

                    // This is not really the correct way to reconstruct the document because a
                    // single space is not always the appropriate whitespace between tokens.
                    if (correctedText.Length > 0)
                    {
                        correctedText.Append(" ");
                    }
                    correctedText.Append(correction);
                }
            }

            return correctedText.ToString();
        }
Пример #4
0
        /// <summary>
        /// Persists a document to disk: the document-level sentiment (.snt), the sentence-level
        /// sentiments (.csv) and the user's entity annotations in the format selected by
        /// <c>doc.Type</c> ("default", "xml", "stanford" or "luis").
        /// </summary>
        /// <param name="doc">
        /// Document providing the raw text, file name, sentiments and the JSON-encoded annotations.
        /// </param>
        /// <returns>
        /// <c>true</c> when the annotation file was written; <c>false</c> when writing it failed
        /// or <c>doc.Type</c> is not one of the known formats. Failures while writing the two
        /// sentiment files are logged and do not affect the result.
        /// </returns>
        private bool TransformAnnotationDocument(Models.Document doc)
        {
            string text = doc.RawText;
            string type = doc.Type;

            string user             = "******";
            string todayString      = DateTime.Today.ToString("MMddyyyy");
            string originalFilename = doc.FileName;

            // The two sentiment files are best effort: a failure is logged, not propagated.
            WriteDocumentSentimentFile(doc, todayString, user);
            WriteSentenceSentimentFile(doc, todayString, user);

            // Process the user entered annotations. An empty string deserializes to null,
            // which every format below treats as "no annotations".
            List <Annotation> clientAnnotations = JsonConvert.DeserializeObject <List <Annotation> >(doc.Annotations ?? "");
            if (clientAnnotations != null)
            {
                // All output formats walk the annotations in text order.
                clientAnnotations.Sort((first, second) => first.begin.CompareTo(second.begin));
            }

            // Here we write to file with the chosen annotation type
            switch (type)
            {
                case "default":
                    return WriteAnnotationFile(todayString, user, Path.ChangeExtension(originalFilename, ".ann"), type,
                                               BuildEntityMentions(text, clientAnnotations));

                case "xml":
                    return WriteAnnotationFile(todayString, user, Path.ChangeExtension(originalFilename, ".xml"), type,
                                               new object[] { BuildXmlBody(text, clientAnnotations) });

                case "stanford":
                    return WriteAnnotationFile(todayString, user, Path.ChangeExtension(originalFilename, ".conll"), type,
                                               new object[] { BuildConllBody(text, clientAnnotations) });

                case "luis":
                    return WriteAnnotationFile(todayString, user, Path.ChangeExtension(originalFilename, ".lou"), type,
                                               new object[] { BuildLuisBody(clientAnnotations) });

                default:
                    return false;
            }
        }

        /// <summary>
        /// Builds the output path for <paramref name="filename"/> under
        /// filesRoot/todayString/user/. In the Debug environment the path is mapped through the
        /// web server; in the Release environment it is used as configured.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The "environment" app setting matches neither Debug nor Release.
        /// </exception>
        private string ResolveOutputPath(string todayString, string user, string filename)
        {
            string environment  = ConfigurationManager.AppSettings["environment"];
            string relativePath = ConfigurationManager.AppSettings["filesRoot"] + todayString + "/" + user + "/" + filename;

            if (environment == Debug)
            {
                return System.Web.HttpContext.Current.Server.MapPath(relativePath);
            }
            if (environment == Release)
            {
                return relativePath;
            }
            throw new InvalidOperationException("Unrecognized \"environment\" app setting: " + environment);
        }

        /// <summary>
        /// Creates the target directory if needed and opens <paramref name="filename"/> for
        /// writing, overwriting any existing file. The caller disposes the returned writer.
        /// </summary>
        private StreamWriter OpenOutputWriter(string todayString, string user, string filename)
        {
            System.IO.FileInfo file = new System.IO.FileInfo(ResolveOutputPath(todayString, user, filename));
            file.Directory.Create();
            return new StreamWriter(file.FullName, false);
        }

        /// <summary>Writes the document-level sentiment to a .snt file; failures are logged only.</summary>
        private void WriteDocumentSentimentFile(Models.Document doc, string todayString, string user)
        {
            string sentiment = doc.DocumentSentiment;
            string filename  = Path.ChangeExtension(doc.FileName, ".snt");

            try
            {
                using (StreamWriter sentFile = OpenOutputWriter(todayString, user, filename))
                {
                    sentFile.WriteLine(sentiment);
                }
            }
            catch (Exception e)
            {
                // Deliberately best effort, but leave a trace instead of failing silently.
                System.Diagnostics.Trace.TraceError("Failed to write sentiment file {0}: {1}", filename, e);
            }
        }

        /// <summary>
        /// Writes one "Sentiment,Sentence" CSV row per sentence-level sentiment to a .csv file;
        /// a missing (null) sentiment is written as "Unknown". Failures are logged only.
        /// </summary>
        private void WriteSentenceSentimentFile(Models.Document doc, string todayString, string user)
        {
            List <string> senSentiment = doc.SentenceSentiment;
            List <string> docSentences = doc.Sentences;
            string        filename     = Path.ChangeExtension(doc.FileName, ".csv");

            try
            {
                using (StreamWriter sentFile = OpenOutputWriter(todayString, user, filename))
                {
                    var writer = new CsvWriter(sentFile);
                    writer.Configuration.Delimiter = ",";

                    // Write the header
                    writer.WriteField("Sentiment");
                    writer.WriteField("Sentence");
                    writer.NextRecord();

                    for (int sen = 0; sen < senSentiment.Count; sen++)
                    {
                        // Read both values before writing so a missing sentence fails the row up front.
                        string sentence = docSentences[sen];
                        string senSen   = senSentiment[sen];

                        writer.WriteField(senSen ?? "Unknown");
                        writer.WriteField(sentence);
                        writer.NextRecord();
                    }
                }
            }
            catch (Exception e)
            {
                // Deliberately best effort, but leave a trace instead of failing silently.
                System.Diagnostics.Trace.TraceError("Failed to write sentence sentiment file {0}: {1}", filename, e);
            }
        }

        /// <summary>
        /// Writes the two-line comment header followed by one line per item of
        /// <paramref name="bodyLines"/> to the annotation file.
        /// </summary>
        /// <returns><c>true</c> on success; <c>false</c> (after logging) when anything fails.</returns>
        private bool WriteAnnotationFile(string todayString, string user, string filename, string type,
                                         System.Collections.IEnumerable bodyLines)
        {
            try
            {
                using (StreamWriter annFile = OpenOutputWriter(todayString, user, filename))
                {
                    annFile.WriteLine("###THIS IS A COMMENT BLOCK###");
                    annFile.WriteLine("###FORMAT: " + type + " ###");
                    foreach (object line in bodyLines)
                    {
                        annFile.WriteLine(line);
                    }
                }
                return true;
            }
            catch (Exception e)
            {
                System.Diagnostics.Trace.TraceError("Failed to write annotation file {0}: {1}", filename, e);
                return false;
            }
        }

        /// <summary>
        /// "default" format: one entity mention per annotation, carrying the annotated substring.
        /// Annotation offsets outside the text make <c>Substring</c> throw.
        /// </summary>
        private static List <EntityMention> BuildEntityMentions(string text, List <Annotation> clientAnnotations)
        {
            List <EntityMention> ems = new List <EntityMention>();
            if (clientAnnotations == null)
            {
                return ems;
            }

            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                ems.Add(new EntityMention
                {
                    begin = clientAnnotation.begin,
                    end   = clientAnnotation.end,
                    type  = clientAnnotation.type,
                    text  = text.Substring(clientAnnotation.begin, clientAnnotation.end - clientAnnotation.begin)
                });
            }
            return ems;
        }

        /// <summary>
        /// "xml" format: the text with every annotated span wrapped in &lt;type&gt;…&lt;/type&gt;.
        /// Annotations must be sorted by begin offset and must not overlap, otherwise
        /// <c>Substring</c> throws on the negative gap length.
        /// </summary>
        private static string BuildXmlBody(string text, List <Annotation> clientAnnotations)
        {
            // NOTE(review): with no annotation list at all the body is empty, whereas an empty
            // list yields the unmodified text. Kept as-is; confirm whether that is intended.
            if (clientAnnotations == null)
            {
                return "";
            }

            var fulltext        = new System.Text.StringBuilder();
            int currentLocation = 0;
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                int    begin      = clientAnnotation.begin;
                int    end        = clientAnnotation.end;
                string entityType = clientAnnotation.type;

                fulltext.Append(text.Substring(currentLocation, begin - currentLocation)); // text before the span
                fulltext.Append("<" + entityType + ">");
                fulltext.Append(text.Substring(begin, end - begin));
                fulltext.Append("</" + entityType + ">");
                currentLocation = end;
            }
            fulltext.Append(text.Substring(currentLocation)); // trailing text after the last span
            return fulltext.ToString();
        }

        /// <summary>
        /// "stanford" format: one "token label" line per token with a blank line after each
        /// sentence. A token gets the current annotation's type when <c>isContainedIn</c> accepts
        /// it, and "O" otherwise.
        /// </summary>
        private string BuildConllBody(string text, List <Annotation> clientAnnotations)
        {
            int    clientAnnotationNumber = 0;
            int    clientAnnotationSize   = 0;
            int    clientAnnotationBegin  = Int32.MaxValue; // sentinels: no token matches when there are no annotations
            int    clientAnnotationEnd    = Int32.MaxValue;
            string clientAnnotationType   = "";
            if (clientAnnotations != null && clientAnnotations.Count > 0)
            {
                clientAnnotationSize  = clientAnnotations.Count;
                clientAnnotationBegin = clientAnnotations[0].begin;
                clientAnnotationEnd   = clientAnnotations[0].end;
                clientAnnotationType  = clientAnnotations[0].type;
            }

            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(text);
            PipelineDispenser.StanfordPipeline.annotate(document);
            List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            var fulltext = new System.Text.StringBuilder();
            foreach (CoreMap sentence in sentences)
            {
                // Read the tokens of this sentence only; reading them from the document would
                // repeat every token of the document once per sentence.
                List <CoreLabel> tokens = JavaExtensions.ToList <CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
                foreach (CoreLabel token in tokens)
                {
                    int    tokenBegin = token.beginPosition();
                    int    tokenEnd   = token.endPosition();
                    string chosenNer  = "O";
                    if (isContainedIn(tokenBegin, tokenEnd, clientAnnotationBegin, clientAnnotationEnd))
                    {
                        chosenNer = clientAnnotationType;

                        // The last token of the span moves the cursor on to the next annotation.
                        if (tokenEnd == clientAnnotationEnd)
                        {
                            clientAnnotationNumber++;
                            if (clientAnnotationNumber < clientAnnotationSize)
                            {
                                Annotation next = clientAnnotations[clientAnnotationNumber];
                                clientAnnotationBegin = next.begin;
                                clientAnnotationEnd   = next.end;
                                clientAnnotationType  = next.type;
                            }
                        }
                    }
                    fulltext.Append(token.value() + " " + chosenNer + Environment.NewLine);
                }
                fulltext.Append(Environment.NewLine);
            }
            return fulltext.ToString();
        }

        /// <summary>
        /// "luis" format: one {"entity", "startPos", "endPos"} object per annotation, each followed
        /// by a comma and a newline. <c>endPos</c> is the annotation's end offset minus one.
        /// </summary>
        private static string BuildLuisBody(List <Annotation> clientAnnotations)
        {
            var fulltext = new System.Text.StringBuilder();
            if (clientAnnotations != null)
            {
                foreach (Annotation clientAnnotation in clientAnnotations)
                {
                    fulltext.Append(
                        "{" +
                        "\"entity\": \"" + clientAnnotation.type
                        + "\", \"startPos\": " + clientAnnotation.begin
                        + ", \"endPos\": " + (clientAnnotation.end - 1)
                        + "}," + "\n"
                        );
                }
            }
            return fulltext.ToString();
        }