/// <summary>
/// Runs the NLP pipeline over the document's raw text and returns its sentences.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is split.</param>
/// <returns>
/// A JSON-serialized list of <c>Models.Text</c> objects, one per detected sentence,
/// each carrying the sentence text in <c>RawText</c>.
/// </returns>
public string SplitSentences(Models.Text doc)
{
    var results = new List<Models.Text>();

    // Annotate the whole text with a pipeline obtained from the dispenser.
    var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
    PipelineDispenser.GetNewPipeline().annotate(annotated);

    var rawSentences = (java.util.List)annotated.get(typeof(SentencesAnnotation));
    foreach (CoreMap sentenceMap in JavaExtensions.ToList<CoreMap>(rawSentences))
    {
        // Each sentence is returned as its own Text object holding only the sentence string.
        results.Add(new Text { RawText = (string)sentenceMap.get(typeof(TextAnnotation)) });
    }

    return JsonConvert.SerializeObject(results);
}
/// <summary>
/// Runs the NLP pipeline over the document's raw text and returns the entity
/// mentions it found as annotation suggestions.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is analysed.</param>
/// <returns>
/// A JSON-serialized list of <c>Bean.Annotation</c> objects; each carries the
/// mention's character begin/end offsets and its named-entity tag as <c>type</c>.
/// </returns>
public string SuggestEntityMentions(Models.Text doc)
{
    var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
    PipelineDispenser.GetNewPipeline().annotate(annotated);

    var rawMentions = (java.util.List)annotated.get(typeof(MentionsAnnotation));
    var suggestions = new List<Bean.Annotation>();

    foreach (CoreMap mention in JavaExtensions.ToList<CoreMap>(rawMentions))
    {
        // Offsets come back as boxed java.lang.Integer values; unwrap them to ints.
        int beginOffset = ((Integer)mention.get(typeof(CharacterOffsetBeginAnnotation))).intValue();
        int endOffset = ((Integer)mention.get(typeof(CharacterOffsetEndAnnotation))).intValue();
        string entityTag = (string)mention.get(typeof(NamedEntityTagAnnotation));

        suggestions.Add(new Bean.Annotation
        {
            begin = beginOffset,
            end = endOffset,
            type = entityTag
        });
    }

    return JsonConvert.SerializeObject(suggestions);
}
/// <summary>
/// Tokenizes the document's raw text and replaces every token that the main
/// word-vector model does not know with the top hit from the spelling model.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is corrected.</param>
/// <returns>
/// The tokens (corrected where applicable) joined by single spaces. Original
/// whitespace is not preserved. Returns an empty string when the text is null or empty.
/// </returns>
public string SpellCorrect(Models.Text doc)
{
    string fulltext = doc.RawText;
    if (string.IsNullOrEmpty(fulltext))
    {
        // Nothing to correct; avoid loading both models for no work.
        return string.Empty;
    }

    // These next two lines really should not be done per call. They should be moved to startup.
    var distance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
    var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

    // The corrected text is rebuilt token by token. A builder is used so that
    // long documents do not pay for a fresh string copy on every token.
    var correctedText = new System.Text.StringBuilder(fulltext.Length);

    // Fetch the tokenization for the document, as we are correcting individual words.
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
    PipelineDispenser.GetNewPipeline().annotate(document);
    List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

    foreach (CoreMap sentence in sentences)
    {
        List<CoreLabel> tokens = JavaExtensions.ToList<CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
        foreach (CoreLabel token in tokens)
        {
            /* Background on the intended approach. We look the token up in the normal
             * word space and, if needed, in the spelling word space. You have something like:
             *
             *   [reliable] - [relieable] + [foriegn] ==> [foreign]
             *
             * To generalise this approach (make it less reliant on "reliable"), we can
             * build a spelling transformation vector by taking the average difference
             * between a set of pairs of correctly and incorrectly spelled words. We can
             * then fix a spelling mistake by subtracting this spelling transformation
             * vector from the incorrectly spelled word vector and finding the word
             * closest to where we end up. */
            string word = token.word();
            string correction = word;

            // A word known to the main model is assumed to be spelled correctly.
            // Only an unknown word is looked up in the spelling model, which saves
            // a second model search for every correctly spelled token.
            if (distance.Search(word).Length == 0)
            {
                BestWord[] spellingBestwords = spellingDistance.Search(word);
                if (spellingBestwords.Length != 0)
                {
                    correction = spellingBestwords[0].Word;
                }
            }

            // This is really not the correct way to reconstruct the document, because
            // a single space is not always the appropriate whitespace.
            if (correctedText.Length > 0)
            {
                correctedText.Append(" ");
            }
            correctedText.Append(correction);
        }
    }

    return correctedText.ToString();
}
/// <summary>
/// Writes a document's sentiment and entity annotations to files under
/// <c>filesRoot/&lt;MMddyyyy&gt;/&lt;user&gt;/</c>.
/// </summary>
/// <remarks>
/// Three files are produced, all named after <c>doc.FileName</c> with a new extension:
/// <list type="bullet">
/// <item><c>.snt</c> - the document-level sentiment (best effort; failures are traced only).</item>
/// <item><c>.csv</c> - one "Sentiment,Sentence" row per sentence (best effort; failures are traced only).</item>
/// <item>An annotation file whose format is selected by <c>doc.Type</c>:
/// "default" (.ann), "xml" (.xml), "stanford" (.conll) or "luis" (.lou).</item>
/// </list>
/// </remarks>
/// <param name="doc">Document carrying the raw text, sentiments, and client annotations (JSON).</param>
/// <returns>
/// <c>true</c> when the annotation file for <c>doc.Type</c> was written;
/// <c>false</c> when writing it failed or <c>doc.Type</c> is not a supported format.
/// </returns>
private bool TransformAnnotationDocument(Models.Document doc)
{
    string text = doc.RawText;
    string type = doc.Type;
    string user = "******";
    string todayString = DateTime.Today.ToString("MMddyyyy");
    string originalFilename = doc.FileName;

    //*****************************************************************************
    // Document-level sentiment, one value in a .snt file. Best effort: a failure
    // here must not prevent the annotation file from being written.
    string sentiment = doc.DocumentSentiment;
    var newSentimentFilename = Path.ChangeExtension(originalFilename, ".snt");
    try
    {
        string sentimentPath = ResolveAnnotationOutputPath(todayString, user, newSentimentFilename);
        using (StreamWriter sentFile = CreateAnnotationOutputWriter(sentimentPath))
        {
            sentFile.WriteLine(sentiment);
        }
    }
    catch (Exception e)
    {
        System.Diagnostics.Trace.TraceError("Failed to write document sentiment file: " + e);
    }

    //*****************************************************************************
    // Sentence-level sentiment as CSV (header plus one row per sentence). Best effort as above.
    List<string> senSentiment = doc.SentenceSentiment;
    List<string> docSentences = doc.Sentences;
    var newSentenceSentimentFilename = Path.ChangeExtension(originalFilename, ".csv");
    try
    {
        string csvPath = ResolveAnnotationOutputPath(todayString, user, newSentenceSentimentFilename);
        using (StreamWriter sentFile = CreateAnnotationOutputWriter(csvPath))
        using (var csv = new CsvWriter(sentFile))
        {
            csv.Configuration.Delimiter = ",";

            // Write the header
            csv.WriteField("Sentiment");
            csv.WriteField("Sentence");
            csv.NextRecord();

            for (int sen = 0; sen < senSentiment.Count; sen++)
            {
                var sentence = docSentences[sen];
                var senSen = senSentiment[sen];

                // Sentences the user did not rate are recorded as "Unknown".
                csv.WriteField(senSen ?? "Unknown");
                csv.WriteField(sentence);
                csv.NextRecord();
            }
        }
    }
    catch (Exception e)
    {
        System.Diagnostics.Trace.TraceError("Failed to write sentence sentiment file: " + e);
    }

    //*****************************************************************************
    // Process the user-entered annotations. An empty payload deserializes to null,
    // which every format below treats as "no annotations".
    string annotations = string.IsNullOrEmpty(doc.Annotations) ? "" : doc.Annotations;
    List<Annotation> clientAnnotations = JsonConvert.DeserializeObject<List<Annotation>>(annotations);
    if (clientAnnotations != null)
    {
        // The writers below walk the text left to right, so order by start offset.
        clientAnnotations.Sort((first, second) => first.begin.CompareTo(second.begin));
    }

    // Here we write to file with the chosen annotation type
    if (type == "default")
    {
        var newFilename = Path.ChangeExtension(originalFilename, ".ann");
        List<EntityMention> ems = new List<EntityMention>();
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                EntityMention em = new EntityMention();
                em.begin = clientAnnotation.begin;
                em.end = clientAnnotation.end;
                em.type = clientAnnotation.type;
                em.text = text.Substring(clientAnnotation.begin, clientAnnotation.end - clientAnnotation.begin);
                ems.Add(em);
            }
        }

        return TryWriteAnnotationOutput(todayString, user, newFilename, type, output =>
        {
            // One mention per line, rendered by EntityMention.ToString().
            foreach (EntityMention em in ems)
            {
                output.WriteLine(em);
            }
        });
    }
    else if (type == "xml")
    {
        var newFilename = Path.ChangeExtension(originalFilename, ".xml");
        var xml = new System.Text.StringBuilder();
        int currentLocation = 0;
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                int begin = clientAnnotation.begin;
                int end = clientAnnotation.end;
                string entityType = clientAnnotation.type;

                // Copy the untagged text before the span, then the span wrapped in its tag.
                xml.Append(text.Substring(currentLocation, begin - currentLocation));
                xml.Append("<" + entityType + ">");
                xml.Append(text.Substring(begin, end - begin));
                xml.Append("</" + entityType + ">");
                currentLocation = end;
            }
        }

        // Always emit the remaining text, so a document without annotations is
        // written out in full instead of as an empty body.
        if (text != null)
        {
            xml.Append(text.Substring(currentLocation));
        }

        string fulltext = xml.ToString();
        return TryWriteAnnotationOutput(todayString, user, newFilename, type, output => output.WriteLine(fulltext));
    }
    else if (type == "stanford")
    {
        var newFilename = Path.ChangeExtension(originalFilename, ".conll");
        var conll = new System.Text.StringBuilder();

        int annotationIndex = 0;
        int annotationCount = clientAnnotations == null ? 0 : clientAnnotations.Count;
        // Annotation currently being matched against tokens. It stays on the last
        // annotation once the list is exhausted and is null only when there are none.
        Annotation current = annotationCount > 0 ? clientAnnotations[0] : null;

        edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(text);
        PipelineDispenser.StanfordPipeline.annotate(document);
        List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));
        foreach (CoreMap sentence in sentences)
        {
            // Tokens must come from the sentence. Reading them from the document
            // would repeat every token of the document once per sentence.
            List<CoreLabel> tokens = JavaExtensions.ToList<CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
            foreach (CoreLabel token in tokens)
            {
                int tokenBegin = token.beginPosition();
                int tokenEnd = token.endPosition();

                // Skip annotations that end at or before this token. Without this, a
                // span that never lines up with a token end would block all later ones.
                while (annotationIndex < annotationCount && current.end <= tokenBegin)
                {
                    annotationIndex++;
                    if (annotationIndex < annotationCount)
                    {
                        current = clientAnnotations[annotationIndex];
                    }
                }

                int spanBegin = current == null ? Int32.MaxValue : current.begin;
                int spanEnd = current == null ? Int32.MaxValue : current.end;

                // "O" marks a token outside any entity.
                string chosenNer = "O";
                if (isContainedIn(tokenBegin, tokenEnd, spanBegin, spanEnd))
                {
                    chosenNer = current == null ? "" : current.type;

                    // The token closes the current span, so move to the next annotation.
                    if (tokenEnd == spanEnd)
                    {
                        annotationIndex++;
                        if (annotationIndex < annotationCount)
                        {
                            current = clientAnnotations[annotationIndex];
                        }
                    }
                }

                conll.Append(token.value() + " " + chosenNer + Environment.NewLine);
            }

            // Blank line between sentences.
            conll.Append(Environment.NewLine);
        }

        string fulltext = conll.ToString();
        return TryWriteAnnotationOutput(todayString, user, newFilename, type, output => output.WriteLine(fulltext));
    }
    else if (type == "luis")
    {
        var newFilename = Path.ChangeExtension(originalFilename, ".lou");
        var luis = new System.Text.StringBuilder();
        if (clientAnnotations != null)
        {
            foreach (Annotation clientAnnotation in clientAnnotations)
            {
                // endPos is written as end - 1, i.e. the index of the last character in the span.
                int endPos = clientAnnotation.end - 1;
                luis.Append(
                    "{" +
                    "\"entity\": \"" + clientAnnotation.type +
                    "\", \"startPos\": " + clientAnnotation.begin +
                    ", \"endPos\": " + endPos +
                    "}," +
                    "\n");
            }
        }

        string fulltext = luis.ToString();
        return TryWriteAnnotationOutput(todayString, user, newFilename, type, output => output.WriteLine(fulltext));
    }
    else
    {
        // Unknown annotation format.
        return false;
    }
}

// Builds the output path for a file in the per-day, per-user folder under the
// "filesRoot" app setting. In the Debug environment the path is mapped through the
// web server; in Release it is used as configured. Throws for any other environment.
private string ResolveAnnotationOutputPath(string todayString, string user, string fileName)
{
    string configuredPath = ConfigurationManager.AppSettings["filesRoot"] + todayString + "/" + user + "/" + fileName;
    string environment = ConfigurationManager.AppSettings["environment"];

    if (environment == Debug)
    {
        return System.Web.HttpContext.Current.Server.MapPath(configuredPath);
    }
    if (environment == Release)
    {
        return configuredPath;
    }

    throw new InvalidOperationException("Unsupported \"environment\" app setting: " + environment);
}

// Creates the target directory if needed and opens the file for writing, replacing existing content.
private static StreamWriter CreateAnnotationOutputWriter(string filePath)
{
    System.IO.FileInfo file = new System.IO.FileInfo(filePath);
    file.Directory.Create();
    return new StreamWriter(file.FullName, false);
}

// Writes an annotation file: the two-line comment header followed by the body produced
// by writeBody. Returns false (after tracing the error) when anything goes wrong.
private bool TryWriteAnnotationOutput(string todayString, string user, string fileName, string type, Action<StreamWriter> writeBody)
{
    try
    {
        string filePath = ResolveAnnotationOutputPath(todayString, user, fileName);
        using (StreamWriter output = CreateAnnotationOutputWriter(filePath))
        {
            output.WriteLine("###THIS IS A COMMENT BLOCK###");
            output.WriteLine("###FORMAT: " + type + " ###");
            writeBody(output);
        }
        return true;
    }
    catch (Exception e)
    {
        System.Diagnostics.Trace.TraceError("Failed to write \"" + type + "\" annotation file: " + e);
        return false;
    }
}