Example #1
        public bool TrainClusteringModels(MySqlDataManipulator manipulator, int companyId, List <string> examplesIn, bool training = false)
        {
            List <KeywordExample> trainingData = new List <KeywordExample>();

            foreach (string sentence in examplesIn)
            {
                List <string>         tokens       = SentenceTokenizer.TokenizeSentence(sentence);
                List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
                List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
                KeywordExample        example      = new KeywordExample();
                foreach (string keyword in keywords)
                {
                    example.AddKeyword(keyword);
                }
                trainingData.Add(example);
            }
            KeywordClusterer.Train(trainingData);
            if (!training)
            {
                return(KeywordClusterer.Save(manipulator, companyId));
            }
            else
            {
                return(true);
            }
        }
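A hedged usage sketch for Example #1: the declaring class and a connected MySqlDataManipulator are not shown above, so processor and manipulator below are hypothetical stand-ins; only the TrainClusteringModels signature comes from the snippet.
        // Minimal sketch; processor and manipulator are hypothetical stand-ins.
        List<string> complaints = new List<string>
        {
            "engine stalls at idle",
            "grinding noise when braking"
        };
        // With the default training: false, the fitted clusterer is persisted
        // via KeywordClusterer.Save(manipulator, companyId).
        bool saved = processor.TrainClusteringModels(manipulator, 1, complaints);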
Example #2
        public List <string> ProcessQuery(MechanicQuery queryIn)
        {
            List <List <string> > complaintTokens = PartOfSpeechTagger.Tag(
                SentenceTokenizer.TokenizeSentence(queryIn.Complaint)
                );
            List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTokens);
            KeywordExample ex       = new KeywordExample();

            foreach (string s in keywords)
            {
                ex.AddKeyword(s);
            }
            List <int>    complaintGroups = KeywordClusterer.PredictTopNSimilarGroups(ex, NUMBER_COMPLAINT_GROUPS);
            List <object> queryDataPoint  = new List <object> {
                queryIn.Make, queryIn.Model
            };

            foreach (int x in complaintGroups)
            {
                queryDataPoint.Add(x);
            }
            List <object> predictedProblems = ProblemPredictor.PredictTopN(queryDataPoint, CalculateDistance, NUMBER_QUERIES_OUT);
            List <string> returnProblems    = new List <string>();

            foreach (object o in predictedProblems)
            {
                returnProblems.Add((string)o);
            }
            return(returnProblems);
        }
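A similar hedged sketch for Example #2; the object initializer assumes MechanicQuery exposes settable Make, Model, and Complaint members (the snippet above only reads them).
        // Hedged sketch; processor is the same hypothetical stand-in as above.
        MechanicQuery query = new MechanicQuery
        {
            Make      = "Toyota",
            Model     = "Corolla",
            Complaint = "engine stalls at idle"
        };
        // Returns up to NUMBER_QUERIES_OUT likely problem descriptions.
        List<string> problems = processor.ProcessQuery(query);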
Example #3
        private bool RestoreKeywordClusterer()
        {
            List <MechanicQuery>  mechanicQueries  = DataSource.LoadMechanicQueries();
            List <KeywordExample> trainingExamples = new List <KeywordExample>();

            foreach (MechanicQuery query in mechanicQueries)
            {
                List <List <string> > complaintTags = PartOfSpeechTagger.Tag(
                    SentenceTokenizer.TokenizeSentence(query.Complaint.ToLower())
                    );
                List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTags);
                KeywordExample example  = new KeywordExample();
                foreach (string s in keywords)
                {
                    example.AddKeyword(s);
                }
                trainingExamples.Add(example);
            }
            KeywordClusterer.Train(trainingExamples);
            try
            {
                AnsEncoderStream streamOut = new AnsEncoderStream(
                    new FileStream(DefaultModelFileLocations.KEYWORD_SIMILARITY_CLUSTERER_FILE, FileMode.Create, FileAccess.Write),
                    1048576,
                    4096
                    );
                KeywordClusterer.Save(streamOut);
                streamOut.Close();
            } catch (IOException)
            {
                return(false);
            }
            return(true);
        }
Example #4
        public void VerifyParseForExampleAbbreviation()
        {
            var sentences =
                SentenceTokenizer.GetSentences(
                    "I know this sounds speculative and vague, because honestly I have no idea what I'm talking about. But China certainly has the motive to manipulate American markets. And since the foreign investment is asymmetrical (e.g. the Chinese can invest in NYSE, but Americans cannot invest in Shanghai), China has an opportunity to play both sides of a Shanghai market crash and any corresponding American market drop.");

            Assert.AreEqual(3, sentences.Count);
        }
Example #5
 public Form1()
 {
     InitializeComponent();
     lines         = System.IO.File.ReadAllLines("stopwords.txt");
     normalizer    = new Normalizer(true, false, false);
     senTokenizer  = new SentenceTokenizer();
     wordTokenizer = new WordTokenizer(true);
     tagger        = new POSTagger();
 }
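Example #5 wires up an NHazm-style Persian NLP pipeline (normalizer, sentence and word tokenizers, POS tagger). A minimal hedged sketch of the sentence-splitting stage, the only call these components confirm elsewhere in this listing (Tokenize, used in Example #11):
     // Only SentenceTokenizer.Tokenize is confirmed below; the Normalizer,
     // WordTokenizer, and POSTagger call signatures are not shown in this listing.
     var senTokenizer = new SentenceTokenizer();
     foreach (string sentence in senTokenizer.Tokenize("جدا کردن ساده است. تقریبا البته!"))
     {
         Console.WriteLine(sentence); // prints each of the two sentences
     }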
Example #6
        public void TestSentenceTokenization()
        {
            string        text      = @"It's easy to understand for developers and 'plays well with everything'.
Don't underestimate the importance of toolchain support. Want to spin up a Ruby microservice which speaks HTTP? Sinatra and you're done. Go? Whatever the Go HTTP library is and you're done. Need to interact with it from the command line? Curl and you're done. How about from an automated testing script written in Ruby? Net:HTTP/HTTParty and you're done. Thinking about how to deploy it vis-a-vis firewall/etc? Port 80 and you're done. Need coarse-grained access logging? Built into Nginx/etc already; you're done. Uptime monitoring? Expose a /monitor endpoint; provide URL to existing monitoring solution; you're done. Deployment orchestration? Use Capistrano/shell scripts/whatever you already use for the app proper and you're done. Encrypted transport? HTTPS and you're done. Auth/auth? A few options that you're very well-acquainted with and are known to be consumable on both ends of the service trivially.
Edit to note: I'm assuming, in the above, that one is already sold on the benefits of a microservice architecture for one's particular app/team and is deciding on transport layer for the architecture. FWIW, I run ~4 distinct applications, and most of them are comparatively large monolithic Rails apps. My company's bookkeeping runs in the same memory space as bingo card PDF generation.
Things that would tilt me more towards microservices include a very rapid deployment pace, large engineering organizations which multiple distinct teams which each want to be able to control deploys/architecture decisions, particular hotspots in the application which just don't play well with one's main technology stack (as apparently happened in the app featured in this article), etc.";
            List <string> sentences = SentenceTokenizer.Tokenize(text);

            Assert.IsTrue(sentences.Count > 2);
        }
Example #7
        /// <summary>
        /// Attempts to return a list of the top 3 most similar complaint groups from the database
        /// </summary>
        /// <param name="entryIn">The query to predict the most similar complaint groups of</param>
        /// <param name="manipulator">The object to use to access the database</param>
        /// <param name="companyId">The id of the company the request is being made for. Determines which tables to use in the database</param>
        /// <returns>Json formatted string that contains the top 3 complaint groups that are most similar to the query made, and their database ids</returns>
        public string ProcessQueryForComplaintGroups(RepairJobEntry entryIn, MySqlDataManipulator manipulator, int companyId, int numGroupsRequested = 3)
        {
            List <string>         tokens       = SentenceTokenizer.TokenizeSentence(entryIn.Complaint);
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
            List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
            KeywordExample        example      = new KeywordExample();

            foreach (string keyword in keywords)
            {
                example.AddKeyword(keyword);
            }
            KeywordClusterer.Load(manipulator, companyId);
            List <int> groups = KeywordClusterer.PredictTopNSimilarGroups(example, numGroupsRequested);
            List <KeywordGroupEntry> companyComplaintGroups = manipulator.GetCompanyComplaintGroups(companyId);

            if (companyComplaintGroups == null)
            {
                throw new NullReferenceException("Company " + companyId + " complaint groups were not available in database");
            }
            List <KeywordGroupEntry> ret = new List <KeywordGroupEntry>();
            bool uncategorizedAdded      = false;

            foreach (int i in groups)
            {
                if (i == 0 && !uncategorizedAdded)
                {
                    ret.Add(new KeywordGroupEntry("Uncategorized")
                    {
                        Id = 0
                    });
                    uncategorizedAdded = true;
                }
                else if (i != 0)
                {
                    companyComplaintGroups[i - 1].Id = i;
                    ret.Add(companyComplaintGroups[i - 1]);
                }
            }
            JsonListStringConstructor constructor = new JsonListStringConstructor();

            ret.ForEach(obj => constructor.AddElement(ConvertKeywordGroupEntry(obj)));
            return(constructor.ToString());

            JsonDictionaryStringConstructor ConvertKeywordGroupEntry(KeywordGroupEntry e)
            {
                JsonDictionaryStringConstructor r = new JsonDictionaryStringConstructor();

                r.SetMapping("GroupDefinition", e.GroupDefinition);
                r.SetMapping("Id", e.Id);
                return(r);
            }
        }
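Per the doc comment, the method returns a JSON array of the most similar complaint groups. A hedged call sketch: entry, manipulator, and processor are stand-ins, and only the element keys fixed by ConvertKeywordGroupEntry are certain.
        // Hedged sketch; the initializer assumes RepairJobEntry has a settable Complaint member.
        RepairJobEntry entry = new RepairJobEntry { Complaint = "car pulls to the left" };
        string json = processor.ProcessQueryForComplaintGroups(entry, manipulator, companyId: 1);
        // json is shaped like: [{"GroupDefinition":"Uncategorized","Id":0}, ...]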
Example #8
 public override TokenStream TokenStream(String fieldName, TextReader reader)
 {
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenizer(result, wordSegment);
     // result = new LowerCaseFilter(result);
     // LowerCaseFilter is no longer needed, because SegTokenFilter already lower-cases all English characters
     // stemming is too strict, but this is not a bug, it's a feature :)
     result = new PorterStemFilter(result);
     if (stopWords != null)
     {
         result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
     }
     return result;
 }
Example #9
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new SentenceTokenizer(reader);

            result = new WordTokenizer(result, wordSegment);
            // result = new LowerCaseFilter(result);
            // LowerCaseFilter is no longer needed, because SegTokenFilter already lower-cases all English characters
            // stemming is too strict, but this is not a bug, it's a feature :)
            result = new PorterStemFilter(result);
            if (stopWords != null)
            {
                result = new StopFilter(true, result, StopFilter.MakeStopSet(stopWords), false);
            }
            return(result);
        }
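Examples #8 and #9 override Analyzer.TokenStream in what looks like the Lucene.Net 2.9/3.0 API (judging by the StopFilter overload). A hedged sketch of how such an analyzer is typically consumed; MyChineseAnalyzer is a stand-in name, since the class declaring TokenStream is not shown.
        // Hedged sketch using the classic Lucene.Net 2.9/3.0 indexing API.
        var analyzer = new MyChineseAnalyzer();   // stand-in for the analyzer class above
        var dir      = new RAMDirectory();
        var writer   = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        var doc      = new Document();
        doc.Add(new Field("content", "...", Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);                  // runs SentenceTokenizer -> WordTokenizer -> PorterStemFilter -> StopFilter
        writer.Close();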
Example #10
        public List <string> PredictKeywordsInJobData(RepairJobEntry entry, bool complaint = true)
        {
            List <string> tokens;

            if (complaint)
            {
                tokens = SentenceTokenizer.TokenizeSentence(entry.Complaint);
            }
            else
            {
                tokens = SentenceTokenizer.TokenizeSentence(entry.Problem);
            }
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);

            return(KeywordPredictor.PredictKeywords(taggedTokens));
        }
Example #11
        public void TokenizeTest()
        {
            SentenceTokenizer senTokenizer = new SentenceTokenizer();

            string input;
            string[] expected, actual;

            input = "جدا کردن ساده است. تقریبا البته!";
            expected = new string[] { "جدا کردن ساده است.", "تقریبا البته!" };
            actual = senTokenizer.Tokenize(input).ToArray();
            Assert.AreEqual(expected.Length, actual.Length, "Failed to tokenize sentences of '" + input + "' passage");
            for (int i = 0; i < expected.Length; i++)
            {
                Assert.AreEqual(expected[i], actual[i], "Failed to tokenize sentences of '" + input + "' passage");
            }
        }
Example #13
        private bool RestoreProblemPredictor()
        {
            List <MechanicQuery>  mechanicQueries  = DataSource.LoadMechanicQueries();
            List <List <object> > trainingExamples = new List <List <object> >();
            List <object>         targetExamples   = new List <object>();

            foreach (MechanicQuery query in mechanicQueries)
            {
                List <object> currExample = new List <object>();
                currExample.Add(query.Make);
                currExample.Add(query.Model);
                List <List <string> > complaintTags = PartOfSpeechTagger.Tag(
                    SentenceTokenizer.TokenizeSentence(query.Complaint.ToLower())
                    );
                List <string>  keywords = KeywordPredictor.PredictKeywords(complaintTags);
                KeywordExample example  = new KeywordExample();
                foreach (string s in keywords)
                {
                    example.AddKeyword(s);
                }
                List <int> groupsOut = KeywordClusterer.PredictTopNSimilarGroups(example, NUMBER_COMPLAINT_GROUPS);
                foreach (int i in groupsOut)
                {
                    currExample.Add(i);
                }
                trainingExamples.Add(currExample);
                targetExamples.Add(query.Problem.ToLower());
            }
            ProblemPredictor.Train(trainingExamples, targetExamples);
            try
            {
                AnsEncoderStream saveStream = new AnsEncoderStream(
                    new FileStream(DefaultModelFileLocations.KNN_QUERY_PROBLEM_PREDICTOR_FILE, FileMode.Create, FileAccess.Write),
                    1048576,
                    4096
                    );
                ProblemPredictor.Save(saveStream);
                saveStream.Flush();
                saveStream.Close();
            } catch (IOException)
            {
                return(false);
            }
            return(true);
        }
Example #14
        public string ProcessQueryForSimilarQueriesArchive(RepairJobEntry entryIn, MySqlDataManipulator manipulator, int companyId, int problemGroupId, int numRequested, int offset = 0)
        {
            List <string>         tokens       = SentenceTokenizer.TokenizeSentence(entryIn.Problem);
            List <List <string> > taggedTokens = KeywordTagger.Tag(tokens);
            List <string>         keywords     = KeywordPredictor.PredictKeywords(taggedTokens);
            KeywordExample        example      = new KeywordExample();

            foreach (string keyword in keywords)
            {
                example.AddKeyword(keyword);
            }
            KeywordClusterer.Load(manipulator, companyId);
            List <int> groups = KeywordClusterer.PredictTopNSimilarGroups(example, 3);

            entryIn.ComplaintGroups = "[" + string.Join(',', groups) + "]";
            List <RepairJobEntry>     potentials     = manipulator.GetDataEntriesByProblemGroup(companyId, problemGroupId);
            List <EntrySimilarity>    ret            = ProblemPredictor.GetQueryResults(entryIn, potentials, numRequested, offset);
            JsonListStringConstructor retConstructor = new JsonListStringConstructor();

            ret.ForEach(obj => retConstructor.AddElement(ConvertEntrySimilarity(obj)));
            return(retConstructor.ToString());


            JsonDictionaryStringConstructor ConvertEntrySimilarity(EntrySimilarity e)
            {
                JsonDictionaryStringConstructor r = new JsonDictionaryStringConstructor();

                r.SetMapping("Make", e.Entry.Make);
                r.SetMapping("Model", e.Entry.Model);
                r.SetMapping("Complaint", e.Entry.Complaint);
                r.SetMapping("Problem", e.Entry.Problem);
                if (e.Entry.Year == -1)
                {
                    r.SetMapping("Year", "Unknown");
                }
                else
                {
                    r.SetMapping("Year", e.Entry.Year);
                }
                r.SetMapping("Id", e.Entry.Id);
                r.SetMapping("Difference", e.Difference);
                return(r);
            }
        }
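As in Example #7, the return value is a JSON array built by a local converter function. A hedged call sketch (entry, manipulator, and processor are stand-ins); the element keys are fixed by ConvertEntrySimilarity:
        string json = processor.ProcessQueryForSimilarQueriesArchive(entry, manipulator,
                                                                     companyId: 1, problemGroupId: 2, numRequested: 5);
        // Each element carries: Make, Model, Complaint, Problem,
        // Year (or "Unknown" when Entry.Year == -1), Id, and Difference.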
Example #15
        public Dictionary <string, int> GetSentences(string text)
        {
            Dictionary <string, int> SentencesDic = new Dictionary <string, int>();
            int SentCount = 0;

            SentenceTokenizer senTokenizer = new SentenceTokenizer();

            string[] sentences = senTokenizer.Tokenize(text).ToArray();

            foreach (string sentence in sentences)
            {
                if (!SentencesDic.ContainsKey(sentence))
                {
                    SentencesDic.Add(sentence, SentCount++);
                }
            }

            return(SentencesDic);
        }
Example #16
        private static float CalcSimilarity(RepairJobEntry query, RepairJobEntry other)
        {
            IKeywordPredictor        keyPred = NaiveBayesKeywordPredictor.GetGlobalModel();
            AveragedPerceptronTagger tagger  = AveragedPerceptronTagger.GetTagger();
            List <String>            tokened = SentenceTokenizer.TokenizeSentence(query.Complaint);
            List <List <String> >    tagged  = tagger.Tag(tokened);
            List <String>            InputComplaintKeywords = keyPred.PredictKeywords(tagged);

            tokened = SentenceTokenizer.TokenizeSentence(query.Problem);
            tagged  = tagger.Tag(tokened);
            List <String> InputProblemKeywords = keyPred.PredictKeywords(tagged);
            float         score = 0;

            tokened = SentenceTokenizer.TokenizeSentence(other.Complaint);
            tagged  = tagger.Tag(tokened);
            List <String> JobComplaintKeywords = keyPred.PredictKeywords(tagged);

            tokened = SentenceTokenizer.TokenizeSentence(other.Problem);
            tagged  = tagger.Tag(tokened);
            List <String> JobProblemKeywords = keyPred.PredictKeywords(tagged);

            foreach (String keyword in JobComplaintKeywords)
            {
                if (InputComplaintKeywords.Contains(keyword))
                {
                    score++;
                }
            }
            foreach (String keyword in JobProblemKeywords)
            {
                if (InputProblemKeywords.Contains(keyword))
                {
                    score++;
                }
            }
            return(score / (JobComplaintKeywords.Count + JobProblemKeywords.Count));
        }
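The score in Example #16 is the number of shared keywords divided by the combined size of the other job's two keyword lists. A standalone worked illustration with made-up lists (requires System.Linq):
        // Illustration only; the keyword lists are hypothetical.
        List<string> inputComplaint = new List<string> { "engine", "stall", "idle" };
        List<string> jobComplaint   = new List<string> { "engine", "stall" };
        List<string> inputProblem   = new List<string> { "sensor" };
        List<string> jobProblem     = new List<string> { "sensor", "wiring" };
        int   shared = jobComplaint.Count(inputComplaint.Contains)        // 2 shared complaint keywords
                     + jobProblem.Count(inputProblem.Contains);           // 1 shared problem keyword
        float score  = (float)shared / (jobComplaint.Count + jobProblem.Count); // 3 / 4 = 0.75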
Example #17
        public void VerifyParseForTwoSentences()
        {
            var sentences = SentenceTokenizer.GetSentences("This is a test sentence. In addition there is a second sentence.");

            Assert.AreEqual(2, sentences.Count);
        }
Example #18
        /*
         * This method sentence-tokenizes all top level comments
         * The best sentences are those where the words in the sentence
         * occur in the most number of subtree items within the current
         * top level comment
         */
        public List <SentenceObj> GetTopSentences(int N)
        {
            List <SentenceObj>          topSentenceObjs      = new List <SentenceObj>();
            List <string>               topSentences         = new List <string>();
            Dictionary <string, double> sentenceScores       = new Dictionary <string, double>();
            Dictionary <string, string> sentenceAuthors      = new Dictionary <string, string>();
            Dictionary <string, string> sentenceCommentTrees = new Dictionary <string, string>();
            Dictionary <string, int>    sentenceIds          = new Dictionary <string, int>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping(child);
                    string        text          = child.text;
                    List <string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
                    string        bestSentence  = currSentences[0];
                    double        currMax       = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords     = GetAllWords(sentence);
                        bool     goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                        if (goodSentence)
                        {
                            double weightedScore = 0;
                            int    totalIDCount  = 0;
                            foreach (string word in allWords)
                            {
                                if (!stopWords.Contains(word.ToLower()))
                                {
                                    string stemmedWord = Stemmer.GetStem(word);
                                    if (wordIDMapping.ContainsKey(stemmedWord))
                                    {
                                        HashSet <int> idsContainingWord = wordIDMapping[stemmedWord];
                                        totalIDCount  += idsContainingWord.Count;
                                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                                    }
                                }
                            }
                            //add some weighting so that longer sentences have more weight
                            weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                            double avgScore = weightedScore / allWords.Length;
                            if (avgScore > currMax)
                            {
                                currMax      = avgScore;
                                bestSentence = sentence;
                            }
                        }
                    }
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                }
                catch (Exception)
                {
                    // swallow per-comment failures and continue with the next child
                }
            }
            topSentences = sentenceScores.OrderByDescending(x => x.Value).Take(N).Where(y => !string.IsNullOrWhiteSpace(y.Key)).Select(x => x.Key).ToList();
            foreach (var sent in topSentences)
            {
                SentenceObj sentenceObj = new SentenceObj()
                {
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
                };
                topSentenceObjs.Add(sentenceObj);
            }
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
            return(topSentenceObjs);
        }
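The 1 - 1/1.25^length term in Example #18 damps the per-word average so that very short sentences cannot win on a single strong word. A quick illustration of how the factor approaches 1 as sentences get longer:
        // Length-weighting factor used above: longer sentences keep more of their score.
        foreach (int len in new[] { 3, 5, 10, 20 })
        {
            double factor = 1 - 1 / Math.Pow(1.25, len);
            Console.WriteLine($"{len} words -> factor {factor:F2}");  // 0.49, 0.67, 0.89, 0.99
        }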