Example #1
        public bool TrainClusteringModels(MySqlDataManipulator manipulator, int companyId, List <string> examplesIn, bool training = false)
        {
            List <KeywordExample> trainingData = new List <KeywordExample>();

            foreach (string sentence in examplesIn)
            {
                List <string>         tokens       = SentenceTokenizer.TokenizeSentence(sentence);
                List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
                List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
                KeywordExample        example      = new KeywordExample();
                foreach (string keyword in keywords)
                {
                    example.AddKeyword(keyword);
                }
                trainingData.Add(example);
            }
            KeywordClusterer.Train(trainingData);
            if (!training)
            {
                return(KeywordClusterer.Save(manipulator, companyId));
            }
            else
            {
                return(true);
            }
        }
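A hedged usage sketch for Example #1: the declaring class and a connected MySqlDataManipulator are not shown above, so processor and manipulator below are hypothetical stand-ins; only the TrainClusteringModels signature comes from the snippet.
        // Minimal sketch; processor and manipulator are hypothetical stand-ins.
        List<string> complaints = new List<string>
        {
            "engine stalls at idle",
            "grinding noise when braking"
        };
        // With the default training: false, the fitted clusterer is persisted
        // via KeywordClusterer.Save(manipulator, companyId).
        bool saved = processor.TrainClusteringModels(manipulator, 1, complaints);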
Example #2
        public List <string> ProcessQuery(MechanicQuery queryIn)
        {
            List <List <string> > complaintTokens = PartOfSpeechTagger.Tag(
                SentenceTokenizer.TokenizeSentence(queryIn.Complaint)
                );
            List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTokens);
            KeywordExample ex       = new KeywordExample();

            foreach (string s in keywords)
            {
                ex.AddKeyword(s);
            }
            List <int>    complaintGroups = KeywordClusterer.PredictTopNSimilarGroups(ex, NUMBER_COMPLAINT_GROUPS);
            List <object> queryDataPoint  = new List <object> {
                queryIn.Make, queryIn.Model
            };

            foreach (int x in complaintGroups)
            {
                queryDataPoint.Add(x);
            }
            List <object> predictedProblems = ProblemPredictor.PredictTopN(queryDataPoint, CalculateDistance, NUMBER_QUERIES_OUT);
            List <string> returnProblems    = new List <string>();

            foreach (object o in predictedProblems)
            {
                returnProblems.Add((string)o);
            }
            return(returnProblems);
        }
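A similar hedged sketch for Example #2; the object initializer assumes MechanicQuery exposes settable Make, Model, and Complaint members (the snippet above only reads them).
        // Hedged sketch; processor is the same hypothetical stand-in as above.
        MechanicQuery query = new MechanicQuery
        {
            Make      = "Toyota",
            Model     = "Corolla",
            Complaint = "engine stalls at idle"
        };
        // Returns up to NUMBER_QUERIES_OUT likely problem descriptions.
        List<string> problems = processor.ProcessQuery(query);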
Example #3
        private bool RestoreKeywordClusterer()
        {
            List <MechanicQuery>  mechanicQueries  = DataSource.LoadMechanicQueries();
            List <KeywordExample> trainingExamples = new List <KeywordExample>();

            foreach (MechanicQuery query in mechanicQueries)
            {
                List <List <string> > complaintTags = PartOfSpeechTagger.Tag(
                    SentenceTokenizer.TokenizeSentence(query.Complaint.ToLower())
                    );
                List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTags);
                KeywordExample example  = new KeywordExample();
                foreach (string s in keywords)
                {
                    example.AddKeyword(s);
                }
                trainingExamples.Add(example);
            }
            KeywordClusterer.Train(trainingExamples);
            try
            {
                AnsEncoderStream streamOut = new AnsEncoderStream(
                    new FileStream(DefaultModelFileLocations.KEYWORD_SIMILARITY_CLUSTERER_FILE, FileMode.Create, FileAccess.Write),
                    1048576,
                    4096
                    );
                KeywordClusterer.Save(streamOut);
                streamOut.Close();
            } catch (IOException)
            {
                return(false);
            }
            return(true);
        }
Example #4
        public void VerifyParseForExampleAbbreviation()
        {
            var sentences =
                SentenceTokenizer.GetSentences(
                    "I know this sounds speculative and vague, because honestly I have no idea what I'm talking about. But China certainly has the motive to manipulate American markets. And since the foreign investment is asymmetrical (e.g. the Chinese can invest in NYSE, but Americans cannot invest in Shanghai), China has an opportunity to play both sides of a Shanghai market crash and any corresponding American market drop.");

            Assert.AreEqual(3, sentences.Count);
        }
Example #5
 public Form1()
 {
     InitializeComponent();
     lines         = System.IO.File.ReadAllLines("stopwords.txt");
     normalizer    = new Normalizer(true, false, false);
     senTokenizer  = new SentenceTokenizer();
     wordTokenizer = new WordTokenizer(true);
     tagger        = new POSTagger();
 }
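Example #5 wires up an NHazm-style Persian NLP pipeline (normalizer, sentence and word tokenizers, POS tagger). A minimal hedged sketch of the sentence-splitting stage, the only call these components confirm elsewhere in this listing (Tokenize, used in Example #11):
     // Only SentenceTokenizer.Tokenize is confirmed below; the Normalizer,
     // WordTokenizer, and POSTagger call signatures are not shown in this listing.
     var senTokenizer = new SentenceTokenizer();
     foreach (string sentence in senTokenizer.Tokenize("جدا کردن ساده است. تقریبا البته!"))
     {
         Console.WriteLine(sentence); // prints each of the two sentences
     }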
Example #6
        public void TestSentenceTokenization()
        {
            string        text      = @"It's easy to understand for developers and 'plays well with everything'.
Don't underestimate the importance of toolchain support. Want to spin up a Ruby microservice which speaks HTTP? Sinatra and you're done. Go? Whatever the Go HTTP library is and you're done. Need to interact with it from the command line? Curl and you're done. How about from an automated testing script written in Ruby? Net:HTTP/HTTParty and you're done. Thinking about how to deploy it vis-a-vis firewall/etc? Port 80 and you're done. Need coarse-grained access logging? Built into Nginx/etc already; you're done. Uptime monitoring? Expose a /monitor endpoint; provide URL to existing monitoring solution; you're done. Deployment orchestration? Use Capistrano/shell scripts/whatever you already use for the app proper and you're done. Encrypted transport? HTTPS and you're done. Auth/auth? A few options that you're very well-acquainted with and are known to be consumable on both ends of the service trivially.
Edit to note: I'm assuming, in the above, that one is already sold on the benefits of a microservice architecture for one's particular app/team and is deciding on transport layer for the architecture. FWIW, I run ~4 distinct applications, and most of them are comparatively large monolithic Rails apps. My company's bookkeeping runs in the same memory space as bingo card PDF generation.
Things that would tilt me more towards microservices include a very rapid deployment pace, large engineering organizations which multiple distinct teams which each want to be able to control deploys/architecture decisions, particular hotspots in the application which just don't play well with one's main technology stack (as apparently happened in the app featured in this article), etc.";
            List <string> sentences = SentenceTokenizer.Tokenize(text);

            Assert.IsTrue(sentences.Count > 2);
        }
Example #7
        /// <summary>
        /// Attempts to return a list of the top 3 most similar complaint groups from the database
        /// </summary>
        /// <param name="entryIn">The query to predict the most similar complaint groups of</param>
        /// <param name="manipulator">The object to use to access the database</param>
        /// <param name="companyId">The id of the company the request is being made for. Determines which tables to use in the database</param>
        /// <returns>Json formatted string that contains the top 3 complaint groups that are most similar to the query made, and their database ids</returns>
        public string ProcessQueryForComplaintGroups(RepairJobEntry entryIn, MySqlDataManipulator manipulator, int companyId, int numGroupsRequested = 3)
        {
            List <string>         tokens       = SentenceTokenizer.TokenizeSentence(entryIn.Complaint);
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
            List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
            KeywordExample        example      = new KeywordExample();

            foreach (string keyword in keywords)
            {
                example.AddKeyword(keyword);
            }
            KeywordClusterer.Load(manipulator, companyId);
            List <int> groups = KeywordClusterer.PredictTopNSimilarGroups(example, numGroupsRequested);
            List <KeywordGroupEntry> companyComplaintGroups = manipulator.GetCompanyComplaintGroups(companyId);

            if (companyComplaintGroups == null)
            {
                throw new NullReferenceException("Company " + companyId + " complaint groups were not available in database");
            }
            List <KeywordGroupEntry> ret = new List <KeywordGroupEntry>();
            bool uncategorizedAdded      = false;

            foreach (int i in groups)
            {
                if (i == 0 && !uncategorizedAdded)
                {
                    ret.Add(new KeywordGroupEntry("Uncategorized")
                    {
                        Id = 0
                    });
                    uncategorizedAdded = true;
                }
                else if (i != 0)
                {
                    companyComplaintGroups[i - 1].Id = i;
                    ret.Add(companyComplaintGroups[i - 1]);
                }
            }
            JsonListStringConstructor constructor = new JsonListStringConstructor();

            ret.ForEach(obj => constructor.AddElement(ConvertKeywordGroupEntry(obj)));
            return(constructor.ToString());

            JsonDictionaryStringConstructor ConvertKeywordGroupEntry(KeywordGroupEntry e)
            {
                JsonDictionaryStringConstructor r = new JsonDictionaryStringConstructor();

                r.SetMapping("GroupDefinition", e.GroupDefinition);
                r.SetMapping("Id", e.Id);
                return(r);
            }
        }
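Per the doc comment, the method returns a JSON array of the most similar complaint groups. A hedged call sketch: entry, manipulator, and processor are stand-ins, and only the element keys fixed by ConvertKeywordGroupEntry are certain.
        // Hedged sketch; the initializer assumes RepairJobEntry has a settable Complaint member.
        RepairJobEntry entry = new RepairJobEntry { Complaint = "car pulls to the left" };
        string json = processor.ProcessQueryForComplaintGroups(entry, manipulator, companyId: 1);
        // json is shaped like: [{"GroupDefinition":"Uncategorized","Id":0}, ...]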
Example #8
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenizer(result, wordSegment);
     // result = new LowerCaseFilter(result);
     // LowerCaseFilter is no longer needed, because SegTokenFilter already lower-cases all English characters
     // stemming is too strict, but this is not a bug, it's a feature :)
     result = new PorterStemFilter(result);
     if (stopWords != null)
     {
         result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
     }
     return result;
 }
Example #9
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new SentenceTokenizer(reader);

            result = new WordTokenizer(result, wordSegment);
            // result = new LowerCaseFilter(result);
            // LowerCaseFilter is no longer needed, because SegTokenFilter already lower-cases all English characters
            // stemming is too strict, but this is not a bug, it's a feature :)
            result = new PorterStemFilter(result);
            if (stopWords != null)
            {
                result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
            }
            return(result);
        }
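Examples #8 and #9 override Analyzer.TokenStream in what looks like the Lucene.Net 2.9/3.0 API (judging by the StopFilter overload). A hedged sketch of how such an analyzer is typically consumed; MyChineseAnalyzer is a stand-in name, since the class declaring TokenStream is not shown.
        // Hedged sketch using the classic Lucene.Net 2.9/3.0 indexing API.
        var analyzer = new MyChineseAnalyzer();   // stand-in for the analyzer class above
        var dir      = new RAMDirectory();
        var writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        var doc      = new Document();
        doc.Add(new Field("content", "...", Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);                  // runs SentenceTokenizer -> WordTokenizer -> PorterStemFilter -> StopFilter
        writer.Close();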
Example #10
        public List <string> PredictKeywordsInJobData(RepairJobEntry entry, bool complaint = true)
        {
            List <string> tokens;

            if (complaint)
            {
                tokens = SentenceTokenizer.TokenizeSentence(entry.Complaint);
            }
            else
            {
                tokens = SentenceTokenizer.TokenizeSentence(entry.Problem);
            }
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);

            return(KeywordPredictor.PredictKeywords(taggedTokens));
        }
Example #11
        public void TokenizeTest()
        {
            SentenceTokenizer senTokenizer = new SentenceTokenizer();

            string input;
            string[] expected, actual;

            input = "جدا کردن ساده است. تقریبا البته!";
            expected = new string[] { "جدا کردن ساده است.", "تقریبا البته!" };
            actual = senTokenizer.Tokenize(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize sentences of '" + input + "' passage");
            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i], "Failed to tokenize sentences of '" + input + "' passage");
            }
        }
Example #13
        private bool RestoreProblemPredictor()
        {
            List <MechanicQuery>  mechanicQueries  = DataSource.LoadMechanicQueries();
            List <List <object> > trainingExamples = new List <List <object> >();
            List <object>         targetExamples   = new List <object>();

            foreach (MechanicQuery query in mechanicQueries)
            {
                List <object> currExample = new List <object>();
                currExample.Add(query.Make);
                currExample.Add(query.Model);
                List <List <string> > complaintTags = PartOfSpeechTagger.Tag(
                    SentenceTokenizer.TokenizeSentence(query.Complaint.ToLower())
                    );
                List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTags);
                KeywordExample example  = new KeywordExample();
                foreach (string s in keywords)
                {
                    example.AddKeyword(s);
                }
                List <int> groupsOut = KeywordClusterer.PredictTopNSimilarGroups(example, NUMBER_COMPLAINT_GROUPS);
                foreach (int i in groupsOut)
                {
                    currExample.Add(i);
                }
                trainingExamples.Add(currExample);
                targetExamples.Add(query.Problem.ToLower());
            }
            ProblemPredictor.Train(trainingExamples, targetExamples);
            try
            {
                AnsEncoderStream saveStream = new AnsEncoderStream(
                    new FileStream(DefaultModelFileLocations.KNN_QUERY_PROBLEM_PREDICTOR_FILE, FileMode.Create, FileAccess.Write),
                    1048576,
                    4096
                    );
                ProblemPredictor.Save(saveStream);
                saveStream.Flush();
                saveStream.Close();
            } catch (IOException)
            {
                return(false);
            }
            return(true);
        }
Example #14
        public string ProcessQueryForSimilarQueriesArchive(RepairJobEntry entryIn, MySqlDataManipulator manipulator, int companyId, int problemGroupId, int numRequested, int offset = 0)
        {
            List <string>         tokens       = SentenceTokenizer.TokenizeSentence(entryIn.Problem);
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
            List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
            KeywordExample        example      = new KeywordExample();

            foreach (string keyword in keywords)
            {
                example.AddKeyword(keyword);
            }
            KeywordClusterer.Load(manipulator, companyId);
            List <int> groups = KeywordClusterer.PredictTopNSimilarGroups(example, 3);

            entryIn.ComplaintGroups = "[" + string.Join(',', groups) + "]";
            List <RepairJobEntry>     potentials     = manipulator.GetDataEntriesByProblemGroup(companyId, problemGroupId);
            List <EntrySimilarity>    ret            = ProblemPredictor.GetQueryResults(entryIn, potentials, numRequested, offset);
            JsonListStringConstructor retConstructor = new JsonListStringConstructor();

            ret.ForEach(obj => retConstructor.AddElement(ConvertEntrySimilarity(obj)));
            return(retConstructor.ToString());


            JsonDictionaryStringConstructor ConvertEntrySimilarity(EntrySimilarity e)
            {
                JsonDictionaryStringConstructor r = new JsonDictionaryStringConstructor();

                r.SetMapping("Make", e.Entry.Make);
                r.SetMapping("Model", e.Entry.Model);
                r.SetMapping("Complaint", e.Entry.Complaint);
                r.SetMapping("Problem", e.Entry.Problem);
                if (e.Entry.Year == -1)
                {
                    r.SetMapping("Year", "Unknown");
                }
                else
                {
                    r.SetMapping("Year", e.Entry.Year);
                }
                r.SetMapping("Id", e.Entry.Id);
                r.SetMapping("Difference", e.Difference);
                return(r);
            }
        }
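As in Example #7, the return value is a JSON array built by a local converter function. A hedged call sketch (entry, manipulator, and processor are stand-ins); the element keys are fixed by ConvertEntrySimilarity:
        string json = processor.ProcessQueryForSimilarQueriesArchive(entry, manipulator,
                                                                     companyId: 1, problemGroupId: 2, numRequested: 5);
        // Each element carries: Make, Model, Complaint, Problem,
        // Year (or "Unknown" when Entry.Year == -1), Id, and Difference.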
Example #15
        public Dictionary <string, int> GetSentences(string text)
        {
            Dictionary <string, int> SentencesDic = new Dictionary <string, int>();
            int SentCount = 0;

            SentenceTokenizer senTokenizer = new SentenceTokenizer();

            string[] sentences = senTokenizer.Tokenize(text).ToArray();

            foreach (string sentence in sentences)
            {
                if (!SentencesDic.ContainsKey(sentence))
                {
                    SentencesDic.Add(sentence, SentCount++);
                }
            }

            return(SentencesDic);
        }
Example #16
        private static float CalcSimilarity(RepairJobEntry query, RepairJobEntry other)
        {
            IKeywordPredictor        keyPred = NaiveBayesKeywordPredictor.GetGlobalModel();
            AveragedPerceptronTagger tagger  = AveragedPerceptronTagger.GetTagger();
            List <String>            tokened = SentenceTokenizer.TokenizeSentence(query.Complaint);
            List <List <String> >    tagged  = tagger.Tag(tokened);
            List <String>            InputComplaintKeywords = keyPred.PredictKeywords(tagged);

            tokened = SentenceTokenizer.TokenizeSentence(query.Problem);
            tagged  = tagger.Tag(tokened);
            List <String> InputProblemKeywords = keyPred.PredictKeywords(tagged);
            float         score = 0;

            tokened = SentenceTokenizer.TokenizeSentence(other.Complaint);
            tagged  = tagger.Tag(tokened);
            List <String> JobComplaintKeywords = keyPred.PredictKeywords(tagged);

            tokened = SentenceTokenizer.TokenizeSentence(other.Problem);
            tagged  = tagger.Tag(tokened);
            List <String> JobProblemKeywords = keyPred.PredictKeywords(tagged);

            foreach (String keyword in JobComplaintKeywords)
            {
                if (InputComplaintKeywords.Contains(keyword))
                {
                    score++;
                }
            }
            foreach (String keyword in JobProblemKeywords)
            {
                if (InputProblemKeywords.Contains(keyword))
                {
                    score++;
                }
            }
            return(score / (JobComplaintKeywords.Count + JobProblemKeywords.Count));
        }
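The score in Example #16 is the number of shared keywords divided by the combined size of the other job's two keyword lists. A standalone worked illustration with made-up lists (requires System.Linq):
        // Illustration only; the keyword lists are hypothetical.
        List<string> inputComplaint = new List<string> { "engine", "stall", "idle" };
        List<string> jobComplaint   = new List<string> { "engine", "stall" };
        List<string> inputProblem   = new List<string> { "sensor" };
        List<string> jobProblem     = new List<string> { "sensor", "wiring" };
        int   shared = jobComplaint.Count(inputComplaint.Contains)        // 2 shared complaint keywords
                     + jobProblem.Count(inputProblem.Contains);           // 1 shared problem keyword
        float score  = (float)shared / (jobComplaint.Count + jobProblem.Count); // 3 / 4 = 0.75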
Example #17
        public void VerifyParseForTwoSentences()
        {
            var sentences = SentenceTokenizer.GetSentences("This is a test sentence. In addition there is a second sentence.");

            Assert.AreEqual(2, sentences.Count);
        }
Example #18
        /*
         * This method sentence-tokenizes all top level comments
         * The best sentences are those where the words in the sentence
         * occur in the most number of subtree items within the current
         * top level comment
         */
        public List <SentenceObj> GetTopSentences(int N)
        {
            List <SentenceObj>          topSentenceObjs      = new List <SentenceObj>();
            List <string>               topSentences         = new List <string>();
            Dictionary <string, double> sentenceScores       = new Dictionary <string, double>();
            Dictionary <string, string> sentenceAuthors      = new Dictionary <string, string>();
            Dictionary <string, string> sentenceCommentTrees = new Dictionary <string, string>();
            Dictionary <string, int>    sentenceIds          = new Dictionary <string, int>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping(child);
                    string        text          = child.text;
                    List <string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
                    string        bestSentence  = currSentences[0];
                    double        currMax       = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords     = GetAllWords(sentence);
                        bool     goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                        if (goodSentence)
                        {
                            double weightedScore = 0;
                            int    totalIDCount  = 0;
                            foreach (string word in allWords)
                            {
                                if (!stopWords.Contains(word.ToLower()))
                                {
                                    string stemmedWord = Stemmer.GetStem(word);
                                    if (wordIDMapping.ContainsKey(stemmedWord))
                                    {
                                        HashSet <int> idsContainingWord = wordIDMapping[stemmedWord];
                                        totalIDCount  += idsContainingWord.Count;
                                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                                    }
                                }
                            }
                            //add some weighting so that longer sentences have more weight
                            weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                            double avgScore = weightedScore / allWords.Length;
                            if (avgScore > currMax)
                            {
                                currMax      = avgScore;
                                bestSentence = sentence;
                            }
                        }
                    }
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                }
                catch (Exception)
                {
                    // swallow per-comment failures and continue with the next child
                }
            }
            topSentences = sentenceScores.OrderByDescending(x => x.Value).Take(N).Where(y => !string.IsNullOrWhiteSpace(y.Key)).Select(x => x.Key).ToList();
            foreach (var sent in topSentences)
            {
                SentenceObj sentenceObj = new SentenceObj()
                {
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
                };
                topSentenceObjs.Add(sentenceObj);
            }
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
            return(topSentenceObjs);
        }
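The 1 - 1/1.25^length term in Example #18 damps the per-word average so that very short sentences cannot win on a single strong word. A quick illustration of how the factor approaches 1 as sentences get longer:
        // Length-weighting factor used above: longer sentences keep more of their score.
        foreach (int len in new[] { 3, 5, 10, 20 })
        {
            double factor = 1 - 1 / Math.Pow(1.25, len);
            Console.WriteLine($"{len} words -> factor {factor:F2}");  // 0.49, 0.67, 0.89, 0.99
        }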