        /// <summary>
        /// Click handler: splits the text of <c>textBox1</c> into sentences and
        /// shows them in <c>textBox2</c>, one sentence per line.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Clear first so stale output is not left behind if tokenization fails.
            textBox2.Text = "";
            string input = textBox1.Text;

            string[] output = senTokenizer.Tokenize(input).ToArray();
            if (output.Length == 0)
            {
                return;
            }

            // Assign the control text once instead of re-reading and re-writing it
            // for every sentence. Each sentence, including the last, is followed
            // by a newline, exactly as before.
            textBox2.Text = string.Join(Environment.NewLine, output) + Environment.NewLine;
        }
        /// <summary>
        /// Checks that a multi-paragraph English passage is split into more than
        /// two sentences by <c>SentenceTokenizer.Tokenize</c>.
        /// </summary>
        public void TestSentenceTokenization()
        {
            string        text      = @"It's easy to understand for developers and 'plays well with everything'.
Don't underestimate the importance of toolchain support. Want to spin up a Ruby microservice which speaks HTTP? Sinatra and you're done. Go? Whatever the Go HTTP library is and you're done. Need to interact with it from the command line? Curl and you're done. How about from an automated testing script written in Ruby? Net:HTTP/HTTParty and you're done. Thinking about how to deploy it vis-a-vis firewall/etc? Port 80 and you're done. Need coarse-grained access logging? Built into Nginx/etc already; you're done. Uptime monitoring? Expose a /monitor endpoint; provide URL to existing monitoring solution; you're done. Deployment orchestration? Use Capistrano/shell scripts/whatever you already use for the app proper and you're done. Encrypted transport? HTTPS and you're done. Auth/auth? A few options that you're very well-acquainted with and are known to be consumable on both ends of the service trivially.
Edit to note: I'm assuming, in the above, that one is already sold on the benefits of a microservice architecture for one's particular app/team and is deciding on transport layer for the architecture. FWIW, I run ~4 distinct applications, and most of them are comparatively large monolithic Rails apps. My company's bookkeeping runs in the same memory space as bingo card PDF generation.
Things that would tilt me more towards microservices include a very rapid deployment pace, large engineering organizations which multiple distinct teams which each want to be able to control deploys/architecture decisions, particular hotspots in the application which just don't play well with one's main technology stack (as apparently happened in the app featured in this article), etc.";
            List <string> sentences = SentenceTokenizer.Tokenize(text);

            // Assert the condition directly: AreEqual(condition, true) had the
            // expected/actual arguments reversed and reported only "False != True".
            Assert.IsTrue(sentences.Count > 2, "Expected more than 2 sentences but got " + sentences.Count);
        }
        /// <summary>
        /// Verifies that a two-sentence Persian passage is tokenized into exactly
        /// the two expected sentences, in order.
        /// </summary>
        public void TokenizeTest()
        {
            SentenceTokenizer tokenizer = new SentenceTokenizer();

            string input = "جدا کردن ساده است. تقریبا البته!";
            string[] expected = new string[] { "جدا کردن ساده است.", "تقریبا البته!" };
            string failureMessage = "Failed to tokenize sentences of '" + input + "' passage";

            string[] actual = tokenizer.Tokenize(input).ToArray();

            // Compare the counts first so a length mismatch is reported before
            // any element is indexed.
            Assert.AreEqual(expected.Length, actual.Length, failureMessage);

            int index = 0;
            while (index < expected.Length)
            {
                Assert.AreEqual(expected[index], actual[index], failureMessage);
                index++;
            }
        }
        /// <summary>
        /// Tokenizes a short Persian passage and checks that the result matches
        /// the expected sentences one by one.
        /// </summary>
        public void TokenizeTest()
        {
            string passage = "جدا کردن ساده است. تقریبا البته!";
            string[] wanted = { "جدا کردن ساده است.", "تقریبا البته!" };

            SentenceTokenizer sentenceSplitter = new SentenceTokenizer();
            string[] got = sentenceSplitter.Tokenize(passage).ToArray();

            // The sentence count is checked before the element-wise comparison.
            Assert.AreEqual(wanted.Length, got.Length, "Failed to tokenize sentences of '" + passage + "' passage");

            for (int position = 0; position != wanted.Length; ++position)
            {
                Assert.AreEqual(wanted[position], got[position], "Failed to tokenize sentences of '" + passage + "' passage");
            }
        }
// File: wordmap.cs  Project: qa1/wordmap
        /// <summary>
        /// Splits <paramref name="text"/> into sentences and assigns each distinct
        /// sentence a zero-based index in order of first appearance.
        /// </summary>
        /// <param name="text">The passage to tokenize.</param>
        /// <returns>
        /// A dictionary mapping each distinct sentence to its index. Repeated
        /// sentences keep the index of their first occurrence.
        /// </returns>
        public Dictionary <string, int> GetSentences(string text)
        {
            Dictionary <string, int> sentenceIndexes = new Dictionary <string, int>();
            int nextIndex = 0;

            SentenceTokenizer senTokenizer = new SentenceTokenizer();

            // Iterate the tokenizer result directly; the intermediate array copy
            // served no purpose.
            foreach (string sentence in senTokenizer.Tokenize(text))
            {
                if (!sentenceIndexes.ContainsKey(sentence))
                {
                    // The counter advances only for new sentences, so indexes
                    // stay contiguous.
                    sentenceIndexes.Add(sentence, nextIndex++);
                }
            }

            // The visible source had no return statement although the method
            // declares a dictionary result; returning the map built above.
            return sentenceIndexes;
        }

        /*
         * This method sentence-tokenizes all top-level comments.
         * The best sentences are those whose words occur in the
         * greatest number of subtree items within the current
         * top-level comment.
         */
        // Picks one "best" sentence per entry in `children`, then builds SentenceObj
        // values for the highest-scoring of those sentences.
        //
        // N: maximum number of best sentences taken after ordering by score.
        //
        // NOTE(review): the visible text of this method has no braces, no `try`
        // matching the `catch` below, no statement adding `sentenceObj` to
        // `topSentenceObjs`, and no `return`. These look like lines lost in
        // extraction - confirm against the original file before relying on this.
        public List <SentenceObj> GetTopSentences(int N)
            List <SentenceObj>          topSentenceObjs      = new List <SentenceObj>();
            List <string>               topSentences         = new List <string>();
            // All four dictionaries are keyed by the sentence text itself, so an identical
            // best sentence chosen for two children overwrites the earlier entry.
            Dictionary <string, double> sentenceScores       = new Dictionary <string, double>();
            Dictionary <string, string> sentenceAuthors      = new Dictionary <string, string>();
            Dictionary <string, string> sentenceCommentTrees = new Dictionary <string, string>();
            Dictionary <string, int>    sentenceIds          = new Dictionary <string, int>();

            foreach (children child in children)
                    Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping(child);
                    string        text          = child.text;
                    List <string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
                    // Indexing [0] throws if the tokenizer returns an empty list.
                    // NOTE(review): presumably this is what the catch below is for - confirm.
                    string        bestSentence  = currSentences[0];
                    double        currMax       = double.MinValue;
                    foreach (string sentence in currSentences)
                        string[] allWords     = GetAllWords(sentence);
                        // As written, the second condition counts entries of `stopWords` that do NOT
                        // appear in `allWords`.
                        // NOTE(review): possibly intended to count non-stop words in the sentence - confirm.
                        bool     goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                        if (goodSentence)
                            double weightedScore = 0;
                            // totalIDCount is accumulated below but never read in the visible code.
                            int    totalIDCount  = 0;
                            foreach (string word in allWords)
                                if (!stopWords.Contains(word.ToLower()))
                                    // The mapping is looked up by stem; the frequency lookup uses the raw word.
                                    string stemmedWord = Stemmer.GetStem(word);
                                    if (wordIDMapping.ContainsKey(stemmedWord))
                                        HashSet <int> idsContainingWord = wordIDMapping[stemmedWord];
                                        totalIDCount  += idsContainingWord.Count;
                                        // Contribution = number of ids for the stem divided by (frequency + 1);
                                        // the +1 keeps the divisor non-zero when GetFrequency returns 0.
                                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                            //add some weighting so that longer sentences have more weight
                            // The factor 1 - 1.25^(-wordCount) grows towards 1 as the word count increases.
                            weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                            // The boosted score is then averaged over the number of words.
                            double avgScore = weightedScore / allWords.Length;
                            if (avgScore > currMax)
                                currMax      = avgScore;
                                bestSentence = sentence;
                    // If no sentence passed the goodSentence check, bestSentence is still the first
                    // sentence and currMax is still double.MinValue.
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                // No handler body is visible for this catch, and `ex` is not used in the visible text.
                catch (Exception ex)
            // Take(N) runs before the blank-key filter, so fewer than N sentences can remain.
            topSentences = sentenceScores.OrderByDescending(x => x.Value).Take(N).Where(y => !string.IsNullOrWhiteSpace(y.Key)).Select(x => x.Key).ToList();
            foreach (var sent in topSentences)
                SentenceObj sentenceObj = new SentenceObj()
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
            // Final ordering: descending by GetChildCount of the node looked up by each object's Id.
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();