public string SplitSentences(Models.Text doc)
{
    List<Models.Text> toReturn = new List<Models.Text>();
    string fulltext = doc.RawText;

    // Annotate the raw text with a fresh CoreNLP pipeline.
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
    PipelineDispenser.GetNewPipeline().annotate(document);

    // Wrap each detected sentence in its own Models.Text object.
    List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));
    foreach (CoreMap sentence in sentences)
    {
        Models.Text sentenceObject = new Text();
        sentenceObject.RawText = (string)sentence.get(typeof(TextAnnotation));
        toReturn.Add(sentenceObject);
    }

    return JsonConvert.SerializeObject(toReturn);
}
public string SuggestEntityMentions(Models.Text doc)
{
    string fulltext = doc.RawText;

    // Annotate the raw text with a fresh CoreNLP pipeline.
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
    PipelineDispenser.GetNewPipeline().annotate(document);

    // Each entity mention is reported with its character offsets and named-entity type.
    List<CoreMap> entityMentions = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(MentionsAnnotation)));
    List<Bean.Annotation> annotations = new List<Bean.Annotation>();
    foreach (CoreMap entityMention in entityMentions)
    {
        Bean.Annotation annotation = new Bean.Annotation();
        annotation.begin = ((Integer)entityMention.get(typeof(CharacterOffsetBeginAnnotation))).intValue();
        annotation.end = ((Integer)entityMention.get(typeof(CharacterOffsetEndAnnotation))).intValue();
        annotation.type = (string)entityMention.get(typeof(NamedEntityTagAnnotation));
        annotations.Add(annotation);
    }

    return JsonConvert.SerializeObject(annotations);
}
public string SpellCorrect(Models.Text doc)
{
    string fulltext = doc.RawText;

    // These next two lines really should not be done per call. They should be moved to startup
    // (see the cached-model sketch below).
    var distance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
    var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

    // Here, we manipulate fulltext if there are spelling errors present and then return the edited text,
    // reconstructing it token by token.
    string correctedText = "";

    // Fetch the tokenization for the document, as we are correcting individual words.
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
    PipelineDispenser.GetNewPipeline().annotate(document);

    List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));
    foreach (CoreMap sentence in sentences)
    {
        foreach (CoreLabel token in JavaExtensions.ToList<CoreMap>((java.util.List)sentence.get(typeof(TokensAnnotation))))
        {
            // We have to look this token up in both the normal word space and the spelling word space;
            // at that point, we would do the mathematics to compute the resultant word vector.
            /* You have something like:
             *
             *   [reliable] - [relieable] + [foriegn] ==> [foreign]
             *
             * To generalise this approach (make it less reliant on reliable…),
             * we can build a spelling transformation vector by taking the average
             * difference between a set of pairs of correctly and incorrectly spelled words.
             * We can then fix a spelling mistake by subtracting this spelling transformation
             * vector from the incorrectly spelled word vector and finding the word closest
             * to where we end up. */
            BestWord[] bestwords = distance.Search(token.word());
            BestWord[] spellingBestwords = spellingDistance.Search(token.word());

            if (bestwords.Length == 0)
            {
                // The main vector space does not know this word, so we assume there might be a spelling mistake.
                string correction = token.word();
                if (spellingBestwords.Length != 0)
                {
                    correction = spellingBestwords[0].Word;
                }

                // We have to make a proper decision on the next line.
                if (correctedText.Length > 0)
                {
                    correctedText += " ";
                }
                correctedText = correctedText + correction;
            }
            else
            {
                // We assume that this is spelled right since our main vector space knows of it.
                // This is really not the correct way to reconstruct the document, because a space is not
                // always the appropriate whitespace.
                if (correctedText.Length > 0)
                {
                    correctedText += " ";
                }
                correctedText = correctedText + token.word();
            }
        }
    }

    return correctedText;
}
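// The comment in SpellCorrect notes that the two Distance models should be loaded at startup rather
// than on every call. A minimal sketch of one way to do that, assuming the same model paths as above
// and that a Distance instance can be safely reused across requests (an assumption; if it is not
// thread-safe, a lock or per-thread instance would be needed):
private static readonly Lazy<Distance> CachedDistance = new Lazy<Distance>(
    () => new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin"));

private static readonly Lazy<Distance> CachedSpellingDistance = new Lazy<Distance>(
    () => new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin"));

// SpellCorrect could then use CachedDistance.Value and CachedSpellingDistance.Value instead of
// constructing new Distance instances per call.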
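// The block comment inside SpellCorrect describes building a "spelling transformation vector" from pairs
// of correctly and incorrectly spelled words, which the method above does not actually implement (it simply
// takes the nearest neighbour from the spelling model). The following is a rough sketch of that idea only.
// The getVector and findNearestWord delegates are hypothetical: the Distance class used above does not
// expose raw word vectors or nearest-vector search, so the caller would have to supply implementations,
// e.g. built against the raw .bin model file.
private static float[] BuildSpellingTransform(
    IEnumerable<Tuple<string, string>> correctMisspelledPairs, // Item1 = correct spelling, Item2 = misspelling
    Func<string, float[]> getVector)                           // hypothetical word -> embedding lookup
{
    float[] transform = null;
    int count = 0;

    foreach (var pair in correctMisspelledPairs)
    {
        float[] correctVec = getVector(pair.Item1);
        float[] misspelledVec = getVector(pair.Item2);
        if (correctVec == null || misspelledVec == null)
        {
            continue;
        }

        if (transform == null)
        {
            transform = new float[correctVec.Length];
        }

        // Accumulate (misspelled - correct), so the transform points from correct spellings towards misspellings.
        for (int i = 0; i < transform.Length; i++)
        {
            transform[i] += misspelledVec[i] - correctVec[i];
        }
        count++;
    }

    // Average over all pairs that had vectors.
    if (transform != null)
    {
        for (int i = 0; i < transform.Length; i++)
        {
            transform[i] /= count;
        }
    }

    return transform;
}

// Subtract the transform from the misspelled word's vector and find the closest known word,
// i.e. getVector("relieable") - transform should land near "reliable".
private static string CorrectWord(
    string misspelled,
    float[] transform,
    Func<string, float[]> getVector,      // hypothetical word -> embedding lookup
    Func<float[], string> findNearestWord) // hypothetical nearest-word-by-vector search
{
    float[] vec = getVector(misspelled);
    if (vec == null || transform == null)
    {
        return misspelled;
    }

    float[] shifted = new float[vec.Length];
    for (int i = 0; i < vec.Length; i++)
    {
        shifted[i] = vec[i] - transform[i];
    }

    return findNearestWord(shifted);
}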