/// <summary>
/// Posts the document's raw text to the configured topic-modelling service
/// and returns the service's JSON response body verbatim.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is submitted as the "texts" form field.</param>
/// <returns>The raw response body from the service's /topics endpoint.</returns>
public async System.Threading.Tasks.Task<string> ShowTopicsAsync(Models.Text doc)
{
    var url = ConfigurationManager.AppSettings["TopicServiceUrl"];
    var fullUrl = url + "/topics";

    // NOTE(review): allocating an HttpClient per call risks socket exhaustion
    // under load; preserved here to avoid changing lifetime semantics, but a
    // shared static instance or IHttpClientFactory is preferable.
    using (var client = new HttpClient())
    {
        client.BaseAddress = new Uri(url);
        client.DefaultRequestHeaders.Clear();
        client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
        // Topic modelling can be slow on large documents, so allow a long timeout.
        client.Timeout = TimeSpan.FromMinutes(10);

        var form = new List<KeyValuePair<string, string>>
        {
            new KeyValuePair<string, string>("texts", doc.RawText)
        };

        // Dispose the request message as well (it owns the form content).
        using (var request = new HttpRequestMessage(HttpMethod.Post, fullUrl) { Content = new FormUrlEncodedContent(form) })
        {
            var response = await client.SendAsync(request);
            return await response.Content.ReadAsStringAsync();
        }
    }
}
/// <summary>
/// Identifies the language of the document's raw text using the bundled
/// word-profile language-identifier model shipped under classifiers/LangId.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is classified.</param>
/// <returns>The language label produced by the identifier.</returns>
public string IdentifyLanguage(Models.Text doc)
{
    var modelPath = AppDomain.CurrentDomain.BaseDirectory + "/classifiers/LangId/langprofiles-word-1_5-nfc-10k.bin.gz";
    var identifier = LanguageIdentifier.New(modelPath, "Vector", -1);
    return identifier.Identify(doc.RawText);
}
/// <summary>
/// Splits the document's raw text into sentences with the Stanford CoreNLP
/// pipeline and returns them serialized as a JSON array of Text objects.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is annotated.</param>
/// <returns>JSON-serialized list of one <c>Models.Text</c> per sentence.</returns>
public string SplitSentences(Models.Text doc)
{
    edu.stanford.nlp.pipeline.Annotation annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
    PipelineDispenser.GetNewPipeline().annotate(annotated);

    var sentences = new List<Models.Text>();
    foreach (CoreMap sentence in JavaExtensions.ToList<CoreMap>((java.util.List)annotated.get(typeof(SentencesAnnotation))))
    {
        sentences.Add(new Text { RawText = (string)sentence.get(typeof(TextAnnotation)) });
    }
    return JsonConvert.SerializeObject(sentences);
}
/// <summary>
/// If the document's raw text is an HTML body, extracts the concatenated text
/// of its paragraph (&lt;p&gt;) elements, one per line; otherwise returns the
/// text unchanged.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> may contain HTML.</param>
/// <returns>Plain paragraph text, or the original text when it is not HTML.</returns>
public string ParseHtml(Models.Text doc)
{
    string fulltext = doc.RawText;

    // Only documents that literally start with a <body> tag are treated as HTML.
    if (fulltext.StartsWith("<body>", StringComparison.Ordinal))
    {
        var parser = new HtmlParser();
        var htmlDocument = parser.ParseDocument(fulltext);

        // Build the result in O(n) instead of repeated string concatenation.
        var builder = new System.Text.StringBuilder();
        foreach (var paragraph in htmlDocument.QuerySelectorAll("p"))
        {
            builder.Append(paragraph.TextContent).Append(Environment.NewLine);
        }
        fulltext = builder.ToString();
    }
    return fulltext;
}
/// <summary>
/// Runs Stanford CoreNLP entity-mention detection over the document's raw text
/// and returns the mentions (character offsets plus NER tag) as JSON.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is annotated.</param>
/// <returns>JSON-serialized list of <c>Bean.Annotation</c> spans.</returns>
public string SuggestEntityMentions(Models.Text doc)
{
    edu.stanford.nlp.pipeline.Annotation annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
    PipelineDispenser.GetNewPipeline().annotate(annotated);

    var results = new List<Bean.Annotation>();
    foreach (CoreMap mention in JavaExtensions.ToList<CoreMap>((java.util.List)annotated.get(typeof(MentionsAnnotation))))
    {
        results.Add(new Bean.Annotation
        {
            begin = ((Integer)mention.get(typeof(CharacterOffsetBeginAnnotation))).intValue(),
            end = ((Integer)mention.get(typeof(CharacterOffsetEndAnnotation))).intValue(),
            type = (string)mention.get(typeof(NamedEntityTagAnnotation))
        });
    }
    return JsonConvert.SerializeObject(results);
}
/// <summary>
/// Attempts word-vector-based spelling correction of the document's raw text.
/// Each token unknown to the main word2vec model is replaced by the nearest
/// neighbour from the spelling model (when one exists); known tokens pass
/// through unchanged. Tokens are re-joined with single spaces.
/// </summary>
/// <param name="doc">Document whose <c>RawText</c> is corrected.</param>
/// <returns>The space-joined, possibly corrected token stream.</returns>
public string SpellCorrect(Models.Text doc)
{
    string fulltext = doc.RawText;

    // NOTE(review): loading both word2vec models on every call is expensive;
    // as the original comment noted, these should be created once at startup.
    var distance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
    var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

    // Tokenize the document via CoreNLP; corrections are applied per token.
    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
    PipelineDispenser.GetNewPipeline().annotate(document);
    List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

    // Collect output tokens and join once at the end — avoids the original's
    // O(n^2) string concatenation and the duplicated separator logic.
    // NOTE(review): joining with single spaces does not reconstruct the original
    // whitespace/punctuation spacing exactly; this matches the original
    // implementation's acknowledged limitation.
    var correctedTokens = new List<string>();
    foreach (CoreMap sentence in sentences)
    {
        foreach (CoreLabel token in JavaExtensions.ToList<CoreMap>((java.util.List)sentence.get(typeof(TokensAnnotation))))
        {
            // Spelling-transformation idea (from the original notes): build a
            // vector from pairs of correct/incorrect spellings, e.g.
            //   [reliable] - [relieable] + [foriegn] ==> [foreign]
            // and correct a misspelled word by applying that vector and taking
            // the closest known word.
            string word = token.word();
            BestWord[] bestwords = distance.Search(word);
            if (bestwords.Length == 0)
            {
                // Unknown to the main vector space: assume a possible
                // misspelling and fall back to the spelling model's nearest
                // neighbour when it has one. (Spelling lookup is only needed
                // on this branch, so it is no longer done unconditionally.)
                BestWord[] spellingBestwords = spellingDistance.Search(word);
                correctedTokens.Add(spellingBestwords.Length != 0 ? spellingBestwords[0].Word : word);
            }
            else
            {
                // The main vector knows this word; assume it is spelled correctly.
                correctedTokens.Add(word);
            }
        }
    }
    return string.Join(" ", correctedTokens);
}
/// <summary>Wraps the given raw text in a fresh <c>Models.Text</c> as this view model's backing model.</summary>
/// <param name="str">Raw text for the new model.</param>
public TextViewModel(string str) => Model = new Models.Text(str);