Пример #1
0
        /// <summary>
        /// Posts the document's raw text to the configured topic service and
        /// returns the raw response body (JSON) as a string.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is sent as the "texts" form field.</param>
        /// <returns>The body of the /topics endpoint response.</returns>
        public async System.Threading.Tasks.Task<string> ShowTopicsAsync(Models.Text doc)
        {
            var url     = ConfigurationManager.AppSettings["TopicServiceUrl"];
            var fullUrl = url + "/topics";

            // NOTE(review): creating a new HttpClient per call risks socket
            // exhaustion under load; prefer a shared instance or IHttpClientFactory.
            using (var client = new HttpClient())
            {
                client.BaseAddress = new Uri(url);
                client.DefaultRequestHeaders.Clear();
                client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
                // Topic extraction can be slow on large documents.
                client.Timeout = TimeSpan.FromMinutes(10);

                var form = new List<KeyValuePair<string, string>>
                {
                    new KeyValuePair<string, string>("texts", doc.RawText)
                };

                // HttpRequestMessage is IDisposable; dispose it deterministically.
                using (var request = new HttpRequestMessage(HttpMethod.Post, fullUrl)
                {
                    Content = new FormUrlEncodedContent(form)
                })
                {
                    var response = await client.SendAsync(request);
                    return await response.Content.ReadAsStringAsync();
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Runs the bundled language-identification model over the document's
        /// raw text and returns the detected language label.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is classified.</param>
        /// <returns>The language label produced by the identifier.</returns>
        public string IdentifyLanguage(Models.Text doc)
        {
            // The model file ships with the app; resolve it relative to the base directory.
            string modelPath = AppDomain.CurrentDomain.BaseDirectory + "/classifiers/LangId/langprofiles-word-1_5-nfc-10k.bin.gz";

            var identifier = LanguageIdentifier.New(modelPath, "Vector", -1);
            return identifier.Identify(doc.RawText);
        }
Пример #3
0
        /// <summary>
        /// Splits the document into sentences using the Stanford CoreNLP
        /// pipeline and returns them as a JSON-serialized list of Text objects.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is segmented.</param>
        /// <returns>JSON array of Text objects, one per detected sentence.</returns>
        public string SplitSentences(Models.Text doc)
        {
            var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
            PipelineDispenser.GetNewPipeline().annotate(annotated);

            var sentences = JavaExtensions.ToList<CoreMap>((java.util.List)annotated.get(typeof(SentencesAnnotation)));

            var results = new List<Models.Text>();
            foreach (CoreMap sentence in sentences)
            {
                var sentenceText = new Text();
                sentenceText.RawText = (string)sentence.get(typeof(TextAnnotation));
                results.Add(sentenceText);
            }

            return JsonConvert.SerializeObject(results);
        }
Пример #4
0
        /// <summary>
        /// Extracts plain text from HTML input: when the raw text begins with a
        /// &lt;body&gt; tag, the text content of every &lt;p&gt; element is
        /// concatenated (one per line); otherwise the raw text is returned unchanged.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> may contain HTML.</param>
        /// <returns>The extracted paragraph text, or the original raw text.</returns>
        public string ParseHtml(Models.Text doc)
        {
            string fulltext = doc.RawText;

            // Heuristic: only treat the input as HTML when it starts with <body>.
            // Ordinal comparison: this is a literal tag check, not linguistic text.
            if (fulltext.StartsWith("<body>", StringComparison.Ordinal))
            {
                var parser       = new HtmlParser();
                var htmlDocument = parser.ParseDocument(fulltext);

                // StringBuilder avoids O(n^2) string concatenation over many paragraphs.
                var builder = new StringBuilder();
                foreach (var paragraph in htmlDocument.QuerySelectorAll("p"))
                {
                    builder.Append(paragraph.TextContent).Append(Environment.NewLine);
                }
                fulltext = builder.ToString();
            }
            return fulltext;
        }
Пример #5
0
        /// <summary>
        /// Detects entity mentions in the document via the Stanford CoreNLP
        /// pipeline and returns them (character offsets plus entity tag) as JSON.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is analyzed.</param>
        /// <returns>JSON array of annotations with begin/end offsets and entity type.</returns>
        public string SuggestEntityMentions(Models.Text doc)
        {
            var annotated = new edu.stanford.nlp.pipeline.Annotation(doc.RawText);
            PipelineDispenser.GetNewPipeline().annotate(annotated);

            var mentions = JavaExtensions.ToList<CoreMap>((java.util.List)annotated.get(typeof(MentionsAnnotation)));

            var results = new List<Bean.Annotation>();
            foreach (CoreMap mention in mentions)
            {
                var annotation = new Bean.Annotation();
                annotation.begin = ((Integer)mention.get(typeof(CharacterOffsetBeginAnnotation))).intValue();
                annotation.end   = ((Integer)mention.get(typeof(CharacterOffsetEndAnnotation))).intValue();
                annotation.type  = (string)mention.get(typeof(NamedEntityTagAnnotation));
                results.Add(annotation);
            }

            return JsonConvert.SerializeObject(results);
        }
Пример #6
0
        /// <summary>
        /// Word-level spelling correction: each token unknown to the main
        /// word-vector model is replaced by its nearest neighbor in the spelling
        /// model (when one exists); known tokens pass through unchanged.
        /// Tokens are rejoined with single spaces.
        /// </summary>
        /// <param name="doc">Document whose <c>RawText</c> is corrected.</param>
        /// <returns>The space-joined, possibly corrected token sequence.</returns>
        public string SpellCorrect(Models.Text doc)
        {
            string fulltext = doc.RawText;

            // NOTE(review): these models should be loaded once at startup, not
            // per call — construction is expensive.
            var distance         = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\my_output_model.bin");
            var spellingDistance = new Distance(AppDomain.CurrentDomain.BaseDirectory + "\\wordvec\\spelling_model.bin");

            // Tokenize via the CoreNLP pipeline so we can correct individual words.
            edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(fulltext);
            PipelineDispenser.GetNewPipeline().annotate(document);
            List<CoreMap> sentences = JavaExtensions.ToList<CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));

            // StringBuilder avoids O(n^2) repeated string concatenation.
            var correctedText = new StringBuilder();

            foreach (CoreMap sentence in sentences)
            {
                foreach (CoreLabel token in JavaExtensions.ToList<CoreMap>((java.util.List)sentence.get(typeof(TokensAnnotation))))
                {
                    /* Spelling-vector idea:
                     *
                     * [reliable] - [relieable] + [foriegn] ==> [foreign]
                     * To generalise this approach (make it less reliant on "reliable"),
                     * build a spelling transformation vector from the average
                     * difference between pairs of correctly and incorrectly spelled
                     * words; subtracting it from a misspelled word's vector lands
                     * near the correct spelling. */

                    string word       = token.word();
                    string correction = word;

                    // Unknown to the main model: assume a possible misspelling and
                    // take the nearest neighbor in the spelling model, if any.
                    // (Spelling lookup only runs when actually needed.)
                    if (distance.Search(word).Length == 0)
                    {
                        BestWord[] spellingBestwords = spellingDistance.Search(word);
                        if (spellingBestwords.Length != 0)
                        {
                            correction = spellingBestwords[0].Word;
                        }
                    }

                    // NOTE(review): a single space is not always the correct
                    // whitespace to rebuild the document with.
                    if (correctedText.Length > 0)
                    {
                        correctedText.Append(' ');
                    }
                    correctedText.Append(correction);
                }
            }

            return correctedText.ToString();
        }
Пример #7
0
 /// <summary>
 /// Initializes the view model by wrapping the given raw string in a
 /// <see cref="Models.Text"/> model.
 /// </summary>
 /// <param name="str">Raw text to wrap.</param>
 public TextViewModel(string str) => Model = new Models.Text(str);