void IUtf8JsonSerializable.Write(Utf8JsonWriter writer)
 {
     writer.WriteStartObject();
     // Emit the word list when present; otherwise record an explicit null.
     if (CommonWords != null && CommonWords.Any())
     {
         writer.WritePropertyName("commonWords");
         writer.WriteStartArray();
         foreach (var item in CommonWords)
         {
             writer.WriteStringValue(item);
         }
         writer.WriteEndArray();
     }
     else
     {
         writer.WriteNull("commonWords");
     }
     if (IgnoreCase != null)
     {
         writer.WritePropertyName("ignoreCase");
         writer.WriteBooleanValue(IgnoreCase.Value);
     }
     if (UseQueryMode != null)
     {
         writer.WritePropertyName("queryMode");
         writer.WriteBooleanValue(UseQueryMode.Value);
     }
     writer.WritePropertyName("@odata.type");
     writer.WriteStringValue(ODataType);
     writer.WritePropertyName("name");
     writer.WriteStringValue(Name);
     writer.WriteEndObject();
 }
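For reference, here is a self-contained sketch of producing JSON of the same shape with nothing but System.Text.Json and an in-memory buffer; the property values and the @odata.type string below are made-up placeholders, not taken from the snippet above.

using System;
using System.Buffers;
using System.Text;
using System.Text.Json;

class TokenFilterJsonSketch
{
    static void Main()
    {
        var buffer = new ArrayBufferWriter<byte>();
        using var writer = new Utf8JsonWriter(buffer, new JsonWriterOptions { Indented = true });

        // Same layout as the Write method above: optional string array,
        // optional booleans, then the required @odata.type and name.
        writer.WriteStartObject();
        writer.WritePropertyName("commonWords");
        writer.WriteStartArray();
        writer.WriteStringValue("the");
        writer.WriteStringValue("and");
        writer.WriteEndArray();
        writer.WriteBoolean("ignoreCase", true);
        writer.WriteString("@odata.type", "#Example.Placeholder.TokenFilter"); // placeholder value
        writer.WriteString("name", "myTokenFilter");                           // placeholder value
        writer.WriteEndObject();
        writer.Flush();

        Console.WriteLine(Encoding.UTF8.GetString(buffer.WrittenSpan));
    }
}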
Example #2
        public void Process()
        {
            this.network = NetworkUtil.CreateNetwork();
            Console.WriteLine("Preparing training sets...");
            this.common = new CommonWords(Config.FILENAME_COMMON_WORDS);
            this.histogramGood = new WordHistogram(this.common);
            this.histogramBad = new WordHistogram(this.common);

            // load the good words
            this.histogramGood.BuildFromFile(Config.FILENAME_GOOD_TRAINING_TEXT);
            this.histogramGood.BuildComplete();

            // load the bad words
            this.histogramBad.BuildFromFile(Config.FILENAME_BAD_TRAINING_TEXT);
            this.histogramBad.BuildComplete();

            // remove low scoring words
            this.histogramGood
                    .RemoveBelow((int)this.histogramGood.CalculateMean());
            this.histogramBad.RemovePercent(0.99);

            // remove common words
            this.histogramGood.RemoveCommon(this.histogramBad);

            this.histogramGood.Trim(Config.INPUT_SIZE);

            // Both analyses are built over the same trimmed good-word histogram,
            // which defines the network's input vocabulary.
            this.goodAnalysis = new AnalyzeSentences(this.histogramGood,
                    Config.INPUT_SIZE);
            this.badAnalysis = new AnalyzeSentences(this.histogramGood,
                    Config.INPUT_SIZE);

            this.goodAnalysis.Process(this.trainingSet, 0.9,
                    Config.FILENAME_GOOD_TRAINING_TEXT);
            this.badAnalysis.Process(this.trainingSet, 0.1,
                    Config.FILENAME_BAD_TRAINING_TEXT);

            this.sampleCount = this.trainingSet.Ideal.Count;
            Console.WriteLine("Processing " + this.sampleCount + " training sets.");

            AllocateTrainingSets();

            CopyTrainingSets();

            TrainNetworkBackprop();
            SerializeObject.Save(Config.FILENAME_WHENBORN_NET, this.network);
            SerializeObject.Save(Config.FILENAME_HISTOGRAM, this.histogramGood);
            Console.WriteLine("Training complete.");

        }
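The histogram-pruning steps in Process (drop words below the mean count, drop overlap with the other histogram, trim to a fixed input size) can be tried out with plain dictionaries. This is only a simplified sketch of the idea, not the WordHistogram implementation used above; the word counts are invented.

using System;
using System.Collections.Generic;
using System.Linq;

class HistogramPruneSketch
{
    static void Main()
    {
        var good = new Dictionary<string, int>
        {
            ["born"] = 9, ["year"] = 7, ["in"] = 6, ["the"] = 5, ["a"] = 1
        };
        var bad = new Dictionary<string, int> { ["the"] = 8, ["a"] = 6 };

        // Remove low-scoring words: keep counts at or above the mean (5.6 here).
        double mean = good.Values.Average();
        var pruned = good.Where(kv => kv.Value >= mean)
                         .ToDictionary(kv => kv.Key, kv => kv.Value);

        // Remove words that also appear in the other histogram.
        foreach (string word in bad.Keys)
            pruned.Remove(word);

        // Trim to a fixed input size, keeping the highest counts.
        const int inputSize = 2;
        var trimmed = pruned.OrderByDescending(kv => kv.Value)
                            .Take(inputSize)
                            .ToDictionary(kv => kv.Key, kv => kv.Value);

        Console.WriteLine(string.Join(", ", trimmed.Keys)); // born, year
    }
}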
Example #3
        public List<string> NamedObjectMethod(string text)
        {
            string[] separators = { " ", ".", ",", ";", "-", "(", ")", "[", "]", "*", "#", "$", "%", "\"", "?", "!", ":", "\n", "\r" };

            // Keep capitalized, multi-character tokens that are rare in the
            // common-words list and are not contractions.
            List<string> namedEntities = text
                .Split(separators, StringSplitOptions.RemoveEmptyEntries)
                .Where(x => CommonWords.GetFrequency(x) < 1)
                .Where(x => char.IsUpper(x[0]))
                .Where(x => x.Length > 1)
                .Where(x => !(x.EndsWith("n't") || x.EndsWith("'m") || x.EndsWith("'ll") ||
                              x.EndsWith("'d") || x.EndsWith("'ve") || x.EndsWith("'re") ||
                              x.EndsWith("'s")))
                .ToList();

            return namedEntities;
        }
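A self-contained way to exercise the same filter, with a stand-in for CommonWords.GetFrequency (the sample text, word list, and frequencies are invented for illustration):

using System;
using System.Collections.Generic;
using System.Linq;

class NamedObjectSketch
{
    // Stand-in frequency lookup; unknown words report 0 (i.e. "rare").
    static readonly Dictionary<string, int> Frequencies = new Dictionary<string, int>
    {
        ["The"] = 100, ["It"] = 90, ["visited"] = 40, ["last"] = 35, ["week"] = 30
    };

    static int GetFrequency(string word) =>
        Frequencies.TryGetValue(word, out int f) ? f : 0;

    static void Main()
    {
        string text = "The committee visited CERN and Geneva last week. It wasn't Alice's idea.";
        string[] separators = { " ", ".", ",", ";", "-", "(", ")", "[", "]", "*", "#",
                                "$", "%", "\"", "?", "!", ":", "\n", "\r" };

        List<string> namedEntities = text
            .Split(separators, StringSplitOptions.RemoveEmptyEntries)
            .Where(w => GetFrequency(w) < 1)    // rare words only
            .Where(w => char.IsUpper(w[0]))     // capitalized
            .Where(w => w.Length > 1)           // skip single letters
            .Where(w => !(w.EndsWith("n't") || w.EndsWith("'m") || w.EndsWith("'ll") ||
                          w.EndsWith("'d") || w.EndsWith("'ve") || w.EndsWith("'re") ||
                          w.EndsWith("'s")))    // drop contractions
            .ToList();

        Console.WriteLine(string.Join(", ", namedEntities)); // CERN, Geneva
    }
}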
Example #4
        string GetTagCloudFromDictionary(Dictionary <string, int> dict)
        {
            StringBuilder sb = new StringBuilder();

            foreach (var item in dict)
            {
                double   fontSize       = ((Math.Min(item.Value, 10) + 5) * 100) / 10;
                string[] separators     = { "n't", "'m", "'ll", "'d", "'ve", "'re", "'s" };
                string   actualWord     = item.Key.Split(separators, StringSplitOptions.None)[0];
                double   weight         = Math.Log(item.Value, 2);
                double   frequencyLog   = Math.Log(CommonWords.GetFrequency(actualWord), 2);
                double   actualFontSize = fontSize * (1.5 - (frequencyLog * 1.0 / 25));
                if (actualFontSize > 0)
                {
                    sb.Append("<span style='font-size:");
                    sb.Append(actualFontSize);
                    sb.Append("%'>");
                    sb.Append(item.Key);
                    sb.Append("</span>&nbsp;");
                }
            }
            return(sb.ToString());
        }
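The font-size arithmetic above gives each word a base size of 60–150% from its capped count and then scales it down as the word becomes more common. A quick, standalone check of that arithmetic (the frequency values are made up):

using System;

class TagFontSizeSketch
{
    static double TagFontSize(int count, int commonFrequency)
    {
        // Base size: counts capped at 10 give 60% .. 150% (integer arithmetic).
        double fontSize = ((Math.Min(count, 10) + 5) * 100) / 10;

        // Common words (high frequency) are shrunk; rare words stay large.
        double frequencyLog = Math.Log(commonFrequency, 2);
        return fontSize * (1.5 - (frequencyLog * 1.0 / 25));
    }

    static void Main()
    {
        Console.WriteLine(TagFontSize(10, 1024)); // 150 * (1.5 - 10/25) ≈ 165
        Console.WriteLine(TagFontSize(1, 2));     //  60 * (1.5 -  1/25) ≈ 87.6
    }
}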
Example #5
 public TrainBot()
 {
     this.common = new CommonWords(Config.FILENAME_COMMON_WORDS);
     this.trainingSet = new TrainingSet();
 }
Example #6
 public WordHistogram(CommonWords common)
 {
     this.common = common;
 }
Example #7
        public void GetFrequency()
        {
            int freq = CommonWords.GetFrequency("curious");

            Console.WriteLine(freq);
        }
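CommonWords itself is not shown on this page; tests like this one only rely on a static frequency lookup. A hypothetical stand-in, with an assumed "word,count" file format and a default of 0 for unknown words (this is not the real CommonWords API), could look like:

using System;
using System.Collections.Generic;
using System.IO;

// Hypothetical stand-in for a static word-frequency table; not the real CommonWords class.
static class WordFrequencyTable
{
    static readonly Dictionary<string, int> Table =
        new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    // Assumed file format: one "word,count" pair per line.
    public static void Load(string path)
    {
        foreach (string line in File.ReadLines(path))
        {
            string[] parts = line.Split(',');
            if (parts.Length == 2 && int.TryParse(parts[1], out int count))
                Table[parts[0].Trim()] = count;
        }
    }

    // Unknown words report a frequency of 0.
    public static int GetFrequency(string word) =>
        Table.TryGetValue(word, out int count) ? count : 0;
}

class FrequencyDemo
{
    static void Main()
    {
        WordFrequencyTable.Load("common-words.csv"); // hypothetical file name
        Console.WriteLine(WordFrequencyTable.GetFrequency("curious"));
    }
}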
Example #8
        /*
         * This method sentence-tokenizes all top level comments
         * The best sentences are those where the words in the sentence
         * occur in the most number of subtree items within the current
         * top level comment
         */
        public List <SentenceObj> GetTopSentences(int N)
        {
            List <SentenceObj>          topSentenceObjs      = new List <SentenceObj>();
            List <string>               topSentences         = new List <string>();
            Dictionary <string, double> sentenceScores       = new Dictionary <string, double>();
            Dictionary <string, string> sentenceAuthors      = new Dictionary <string, string>();
            Dictionary <string, string> sentenceCommentTrees = new Dictionary <string, string>();
            Dictionary <string, int>    sentenceIds          = new Dictionary <string, int>();

            foreach (children child in children)
            {
                try
                {
                    Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping(child);
                    string        text          = child.text;
                    List <string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
                    string        bestSentence  = currSentences[0];
                    double        currMax       = double.MinValue;
                    foreach (string sentence in currSentences)
                    {
                        string[] allWords     = GetAllWords(sentence);
                        bool     goodSentence = (allWords.Length > 2) && (stopWords.Where(x => !allWords.Contains(x.ToLower())).Count() > 2);
                        if (goodSentence)
                        {
                            double weightedScore = 0;
                            int    totalIDCount  = 0;
                            foreach (string word in allWords)
                            {
                                if (!stopWords.Contains(word.ToLower()))
                                {
                                    string stemmedWord = Stemmer.GetStem(word);
                                    if (wordIDMapping.ContainsKey(stemmedWord))
                                    {
                                        HashSet <int> idsContainingWord = wordIDMapping[stemmedWord];
                                        totalIDCount  += idsContainingWord.Count;
                                        weightedScore += idsContainingWord.Count * 1.0 / (CommonWords.GetFrequency(word) + 1);
                                    }
                                }
                            }
                            //add some weighting so that longer sentences have more weight
                            weightedScore = weightedScore * (1 - (1 / (Math.Pow(1.25, allWords.Length))));
                            double avgScore = weightedScore / allWords.Length;
                            if (avgScore > currMax)
                            {
                                currMax      = avgScore;
                                bestSentence = sentence;
                            }
                        }
                    }
                    sentenceScores[bestSentence]       = currMax;
                    sentenceAuthors[bestSentence]      = child.author;
                    sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
                    sentenceIds[bestSentence]          = child.id;
                }
                catch (Exception)
                {
                    // Skip comments that fail to parse or tokenize.
                }
            }
            topSentences = sentenceScores.OrderByDescending(x => x.Value).Take(N).Where(y => !string.IsNullOrWhiteSpace(y.Key)).Select(x => x.Key).ToList();
            foreach (var sent in topSentences)
            {
                SentenceObj sentenceObj = new SentenceObj()
                {
                    Author = sentenceAuthors[sent], Sentence = sent, SentenceCommentTree = sentenceCommentTrees[sent], Id = sentenceIds[sent], StoryId = this.id
                };
                topSentenceObjs.Add(sentenceObj);
            }
            topSentenceObjs = topSentenceObjs.OrderByDescending(x => GetChildCount(GetNodeById(x.Id))).ToList();
            return(topSentenceObjs);
        }
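The scoring inside the loop can be read in isolation: each non-stopword contributes the number of subtree comments it appears in, discounted by how common the word is; the total is then length-weighted and averaged. A simplified, self-contained restatement (stemming is omitted, and the stop words, ID sets, and frequencies are invented):

using System;
using System.Collections.Generic;
using System.Linq;

class SentenceScoreSketch
{
    static readonly HashSet<string> StopWords = new HashSet<string> { "the", "a", "of", "in" };

    static double Score(string[] words,
                        Dictionary<string, HashSet<int>> wordIdMapping,
                        Dictionary<string, int> commonFrequency)
    {
        double weightedScore = 0;
        foreach (string word in words)
        {
            if (StopWords.Contains(word.ToLower()))
                continue;
            if (wordIdMapping.TryGetValue(word, out HashSet<int> ids))
            {
                int freq = commonFrequency.TryGetValue(word, out int f) ? f : 0;
                // More comments mentioning the word raise the score; common words count less.
                weightedScore += ids.Count * 1.0 / (freq + 1);
            }
        }
        // Longer sentences get a mild boost, then the score is averaged per word.
        weightedScore *= 1 - 1 / Math.Pow(1.25, words.Length);
        return weightedScore / words.Length;
    }

    static void Main()
    {
        var ids = new Dictionary<string, HashSet<int>>
        {
            ["compiler"] = new HashSet<int> { 1, 2, 5 },
            ["rust"] = new HashSet<int> { 1, 2, 3, 4 }
        };
        var freq = new Dictionary<string, int> { ["rust"] = 3 };
        string[] sentence = { "the", "rust", "compiler", "in", "practice" };
        Console.WriteLine(Score(sentence, ids, freq));
    }
}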
Example #9
        public List <string> GetAnchorWords(children root, int N)
        {
            List <string> anchorWords = new List <string>();

            string[] allWords = GetAllWords(root.SubtreeText);
            Dictionary <string, string> stemParentDictionary = GetStemParentDictionary(allWords);
            children rootNode = new children();
            List <HashSet <int> > rootChildIDs = new List <HashSet <int> >();

            foreach (children child in root.Children)
            {
                GetChildIDHashSetList(child);
                HashSet <int> currChildIDs = new HashSet <int>();
                currChildIDs.Add(child.id);
                foreach (var item in child.ChildIDList)
                {
                    currChildIDs.UnionWith(item);
                }
                rootChildIDs.Add(currChildIDs);
            }
            rootNode.ChildIDList = rootChildIDs;
            NodeList             = new List <children>();
            NodeList.Add(rootNode);
            foreach (children child in root.Children)
            {
                PopulateNodeList(child);
            }
            Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping();
            //Dictionary<string, double> WordTreeScore = new Dictionary<string, double>();
            Dictionary <string, List <children> > WordLCAList = new Dictionary <string, List <children> >();

            foreach (var kvp in wordIDMapping)
            {
                List <children> currLCAList = new List <children>();
                int             numLCAs     = 0;
                foreach (children node in NodeList)
                {
                    int numBranchesWithWord = 0;
                    foreach (var childIDBranch in node.ChildIDList)
                    {
                        if (childIDBranch.Intersect(kvp.Value).Count() > 0)
                        {
                            numBranchesWithWord += 1;
                        }
                    }
                    if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
                    {
                        currLCAList.Add(node);
                    }
                }
                WordLCAList[stemParentDictionary.ContainsKey(kvp.Key) ? stemParentDictionary[kvp.Key] : kvp.Key] = currLCAList;
            }
            anchorWords = WordLCAList.OrderByDescending(x => x.Value.Count).Select(x => x.Key).Where(y => CommonWords.GetFrequency(y) < 20).Take(N).ToList();
            return(anchorWords);
        }
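The ranking in GetAnchorWords hinges on the branch-intersection test: a node counts toward a word when the word's comment IDs fall into more than one of the node's child branches, or into its only branch. A small restatement of just that test, with invented ID sets:

using System;
using System.Collections.Generic;
using System.Linq;

class LcaCountSketch
{
    // A node counts as a "lowest common ancestor" candidate for a word when the
    // word's comment IDs hit more than one child branch, or the node's only branch.
    static bool IsLcaForWord(List<HashSet<int>> childIdBranches, HashSet<int> wordIds)
    {
        int branchesWithWord = childIdBranches.Count(branch => branch.Overlaps(wordIds));
        return (branchesWithWord == 1 && childIdBranches.Count == 1) || branchesWithWord > 1;
    }

    static void Main()
    {
        var branches = new List<HashSet<int>>
        {
            new HashSet<int> { 1, 2 },    // IDs under the first reply
            new HashSet<int> { 3 },       // IDs under the second reply
            new HashSet<int> { 4, 5, 6 }  // IDs under the third reply
        };

        Console.WriteLine(IsLcaForWord(branches, new HashSet<int> { 2, 4 })); // True: spans two branches
        Console.WriteLine(IsLcaForWord(branches, new HashSet<int> { 5, 6 })); // False: only one branch
    }
}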
Example #10
        public Dictionary <string, List <CommentObj> > GetNamedObjects(int N)
        {
            StringBuilder sbAllWords = new StringBuilder();

            foreach (children child in children)
            {
                sbAllWords.Append(child.SubtreeText);
                sbAllWords.Append(" ");
            }
            string[] allWords = GetAllWords(sbAllWords.ToString());
            Dictionary <string, string> stemParentDictionary = GetStemParentDictionary(allWords);
            List <string>         namedObjects = new List <string>();
            children              rootNode     = new children();
            List <HashSet <int> > rootChildIDs = new List <HashSet <int> >();

            foreach (children child in children)
            {
                GetChildIDHashSetList(child);
                HashSet <int> currChildIDs = new HashSet <int>();
                currChildIDs.Add(child.id);
                foreach (var item in child.ChildIDList)
                {
                    currChildIDs.UnionWith(item);
                }
                rootChildIDs.Add(currChildIDs);
            }
            rootNode.ChildIDList = rootChildIDs;
            NodeList             = new List <children>();
            NodeList.Add(rootNode);
            foreach (children child in children)
            {
                PopulateNodeList(child);
            }
            Dictionary <string, HashSet <int> > wordIDMapping = GetWordIDMapping();
            //Dictionary<string, double> WordTreeScore = new Dictionary<string, double>();
            Dictionary <string, List <children> > WordLCAList = new Dictionary <string, List <children> >();

            foreach (var kvp in wordIDMapping)
            {
                List <children> currLCAList = new List <children>();
                int             numLCAs     = 0;
                foreach (children node in NodeList)
                {
                    int numBranchesWithWord = 0;
                    foreach (var childIDBranch in node.ChildIDList)
                    {
                        if (childIDBranch.Intersect(kvp.Value).Count() > 0)
                        {
                            numBranchesWithWord += 1;
                        }
                    }
                    if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) || numBranchesWithWord > 1)
                    {
                        currLCAList.Add(node);
                    }
                }
                WordLCAList[stemParentDictionary.ContainsKey(kvp.Key) ? stemParentDictionary[kvp.Key] : kvp.Key] = currLCAList;
            }
            namedObjects = WordLCAList
                           .OrderByDescending(x => x.Value.Count)
                           .Select(x => x.Key)
                           .Where(y => CommonWords.GetFrequency(y) < 1)
                           .Where(a => char.IsUpper(a[0]))
                           .Where(b => b.Length > 1)
                           .Where(z => !(z.EndsWith("n't") || z.EndsWith("'m") || (z.EndsWith("'ll")) || (z.EndsWith("'d")) || z.EndsWith("'ve") || z.EndsWith("'re") || z.EndsWith("'s")))
                           .Take(N)
                           .ToList();
            //namedObjects.Sort();
            Dictionary <string, List <CommentObj> > namedObjectDictionary = new Dictionary <string, List <CommentObj> >();

            foreach (string namedObject in namedObjects)
            {
                List <CommentObj> commentObjsForWord = new List <CommentObj>();
                string            stem        = Stemmer.GetStem(namedObject);
                HashSet <int>     idsWithWord = wordIDMapping[stem];
                foreach (int id in idsWithWord)
                {
                    children   child      = GetNodeById(id);
                    CommentObj commentObj = new CommentObj()
                    {
                        Id = id, Text = child.text
                    };
                    commentObjsForWord.Add(commentObj);
                }
                namedObjectDictionary[namedObject] = commentObjsForWord;
            }
            var ordered = namedObjectDictionary.Keys.OrderByDescending(x => namedObjectDictionary[x].Count).ToList().ToDictionary(x => x, x => namedObjectDictionary[x]);

            return(ordered);
        }
Example #11
        public void TestMethod1()
        {
            int freq = CommonWords.GetFrequency("this");

            Console.WriteLine(freq);
        }