void IUtf8JsonSerializable.Write(Utf8JsonWriter writer)
{
    writer.WriteStartObject();
    if (CommonWords.Any())
    {
        writer.WritePropertyName("commonWords");
        writer.WriteStartArray();
        foreach (var item in CommonWords)
        {
            writer.WriteStringValue(item);
        }
        writer.WriteEndArray();
    }
    else
    {
        writer.WriteNull("commonWords");
    }
    if (IgnoreCase != null)
    {
        writer.WritePropertyName("ignoreCase");
        writer.WriteBooleanValue(IgnoreCase.Value);
    }
    if (UseQueryMode != null)
    {
        writer.WritePropertyName("queryMode");
        writer.WriteBooleanValue(UseQueryMode.Value);
    }
    writer.WritePropertyName("@odata.type");
    writer.WriteStringValue(ODataType);
    writer.WritePropertyName("name");
    writer.WriteStringValue(Name);
    writer.WriteEndObject();
}
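// A minimal usage sketch, not part of the original sample: drive the
// explicit-interface Write method above through a Utf8JsonWriter backed by a
// MemoryStream (requires System.IO, System.Text, and System.Text.Json).
// "filter" is a placeholder for whatever type implements IUtf8JsonSerializable.
static string SerializeToJson(IUtf8JsonSerializable filter)
{
    using (var stream = new MemoryStream())
    {
        using (var writer = new Utf8JsonWriter(stream))
        {
            filter.Write(writer);
        } // disposing the writer flushes its buffer into the stream
        return Encoding.UTF8.GetString(stream.ToArray());
    }
}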
public void Process()
{
    this.network = NetworkUtil.CreateNetwork();
    Console.WriteLine("Preparing training sets...");
    this.common = new CommonWords(Config.FILENAME_COMMON_WORDS);
    this.histogramGood = new WordHistogram(this.common);
    this.histogramBad = new WordHistogram(this.common);

    // Load the good words.
    this.histogramGood.BuildFromFile(Config.FILENAME_GOOD_TRAINING_TEXT);
    this.histogramGood.BuildComplete();

    // Load the bad words.
    this.histogramBad.BuildFromFile(Config.FILENAME_BAD_TRAINING_TEXT);
    this.histogramBad.BuildComplete();

    // Remove low-scoring words.
    this.histogramGood.RemoveBelow((int)this.histogramGood.CalculateMean());
    this.histogramBad.RemovePercent(0.99);

    // Remove common words.
    this.histogramGood.RemoveCommon(this.histogramBad);
    this.histogramGood.Trim(Config.INPUT_SIZE);

    // Both analyses use the good histogram so the input vocabulary matches.
    this.goodAnalysis = new AnalyzeSentences(this.histogramGood, Config.INPUT_SIZE);
    this.badAnalysis = new AnalyzeSentences(this.histogramGood, Config.INPUT_SIZE);
    this.goodAnalysis.Process(this.trainingSet, 0.9, Config.FILENAME_GOOD_TRAINING_TEXT);
    this.badAnalysis.Process(this.trainingSet, 0.1, Config.FILENAME_BAD_TRAINING_TEXT);

    this.sampleCount = this.trainingSet.Ideal.Count;
    Console.WriteLine("Processing " + this.sampleCount + " training sets.");

    AllocateTrainingSets();
    CopyTrainingSets();
    TrainNetworkBackprop();

    SerializeObject.Save(Config.FILENAME_WHENBORN_NET, this.network);
    SerializeObject.Save(Config.FILENAME_HISTOGRAM, this.histogramGood);
    Console.WriteLine("Training complete.");
}
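// A hypothetical counterpart to the two Save calls above, assuming
// SerializeObject.Load mirrors SerializeObject.Save and that the stored
// network casts back to the type NetworkUtil.CreateNetwork() returned;
// neither assumption is verified against the sample's framework.
public void LoadTrained()
{
    this.network = (BasicNetwork)SerializeObject.Load(Config.FILENAME_WHENBORN_NET);
    this.histogramGood = (WordHistogram)SerializeObject.Load(Config.FILENAME_HISTOGRAM);
}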
public List<string> NamedObjectMethod(string text)
{
    string[] separators =
    {
        " ", ".", ",", ";", "-", "(", ")", "[", "]", "*", "#", "$", "%",
        "\"", "?", "!", ":", "\n", "\r"
    };

    // Candidate named entities: capitalized, multi-character tokens that are
    // rare in the common-words list and are not contraction fragments.
    List<string> namedEntities = text
        .Split(separators, StringSplitOptions.RemoveEmptyEntries)
        .Where(x => CommonWords.GetFrequency(x) < 1)
        .Where(x => char.IsUpper(x[0]))
        .Where(x => x.Length > 1)
        .Where(x => !(x.EndsWith("n't") || x.EndsWith("'m") || x.EndsWith("'ll") ||
                      x.EndsWith("'d") || x.EndsWith("'ve") || x.EndsWith("'re") ||
                      x.EndsWith("'s")))
        .ToList();

    return namedEntities;
}
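// Example invocation with throwaway text. Lowercase, single-character,
// common, and contraction-fragment tokens are filtered out, so a rare
// capitalized token such as "Lovelace" would likely survive.
List<string> entities = NamedObjectMethod("Ada Lovelace annotated the engine; it wasn't built.");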
string GetTagCloudFromDictionary(Dictionary<string, int> dict)
{
    StringBuilder sb = new StringBuilder();
    string[] separators = { "n't", "'m", "'ll", "'d", "'ve", "'re", "'s" };

    foreach (var item in dict)
    {
        // Base size from the (capped) count, then scale down by how common
        // the word is overall: globally frequent words render smaller.
        double fontSize = ((Math.Min(item.Value, 10) + 5) * 100) / 10;
        string actualWord = item.Key.Split(separators, StringSplitOptions.None)[0];
        double frequencyLog = Math.Log(CommonWords.GetFrequency(actualWord), 2);
        double actualFontSize = fontSize * (1.5 - (frequencyLog / 25));

        if (actualFontSize > 0)
        {
            sb.Append("<span style='font-size:");
            sb.Append(actualFontSize);
            sb.Append("%'>");
            sb.Append(item.Key);
            sb.Append("</span> ");
        }
    }

    return sb.ToString();
}
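// Illustrative call with made-up counts: the returned HTML spans can be
// embedded directly in a page. A common word like "the" shrinks because of
// the frequency penalty above.
var counts = new Dictionary<string, int>
{
    { "compiler", 12 },
    { "the", 40 },
    { "allocator", 5 }
};
string tagCloudHtml = GetTagCloudFromDictionary(counts);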
public TrainBot()
{
    this.common = new CommonWords(Config.FILENAME_COMMON_WORDS);
    this.trainingSet = new TrainingSet();
}
public WordHistogram(CommonWords common)
{
    this.common = common;
}
public void GetFrequency()
{
    int freq = CommonWords.GetFrequency("curious");
    Console.WriteLine(freq);
}
/*
 * Sentence-tokenizes all top-level comments. The best sentences are those
 * whose words occur in the largest number of subtree items within the
 * current top-level comment.
 */
public List<SentenceObj> GetTopSentences(int N)
{
    List<SentenceObj> topSentenceObjs = new List<SentenceObj>();
    List<string> topSentences = new List<string>();
    Dictionary<string, double> sentenceScores = new Dictionary<string, double>();
    Dictionary<string, string> sentenceAuthors = new Dictionary<string, string>();
    Dictionary<string, string> sentenceCommentTrees = new Dictionary<string, string>();
    Dictionary<string, int> sentenceIds = new Dictionary<string, int>();

    foreach (children child in children)
    {
        try
        {
            Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping(child);
            string text = child.text;
            List<string> currSentences = SentenceTokenizer.Tokenize(Util.StripTagsCharArray(text));
            string bestSentence = currSentences[0];
            double currMax = double.MinValue;

            foreach (string sentence in currSentences)
            {
                string[] allWords = GetAllWords(sentence);

                // A sentence qualifies if it has more than two words and
                // more than two non-stopword tokens.
                bool goodSentence = allWords.Length > 2 &&
                    allWords.Count(x => !stopWords.Contains(x.ToLower())) > 2;

                if (goodSentence)
                {
                    double weightedScore = 0;
                    foreach (string word in allWords)
                    {
                        if (!stopWords.Contains(word.ToLower()))
                        {
                            string stemmedWord = Stemmer.GetStem(word);
                            if (wordIDMapping.ContainsKey(stemmedWord))
                            {
                                HashSet<int> idsContainingWord = wordIDMapping[stemmedWord];

                                // Words spread across many subtree comments score
                                // higher; globally common words are damped.
                                weightedScore += idsContainingWord.Count * 1.0 /
                                    (CommonWords.GetFrequency(word) + 1);
                            }
                        }
                    }

                    // Add some weighting so that longer sentences have more weight.
                    weightedScore = weightedScore * (1 - (1 / Math.Pow(1.25, allWords.Length)));
                    double avgScore = weightedScore / allWords.Length;
                    if (avgScore > currMax)
                    {
                        currMax = avgScore;
                        bestSentence = sentence;
                    }
                }
            }

            sentenceScores[bestSentence] = currMax;
            sentenceAuthors[bestSentence] = child.author;
            sentenceCommentTrees[bestSentence] = JsonConvert.SerializeObject(GetCommentTreeString(child));
            sentenceIds[bestSentence] = child.id;
        }
        catch (Exception)
        {
            // Skip comments that fail to tokenize or score.
        }
    }

    topSentences = sentenceScores
        .OrderByDescending(x => x.Value)
        .Take(N)
        .Where(y => !string.IsNullOrWhiteSpace(y.Key))
        .Select(x => x.Key)
        .ToList();

    foreach (var sent in topSentences)
    {
        SentenceObj sentenceObj = new SentenceObj()
        {
            Author = sentenceAuthors[sent],
            Sentence = sent,
            SentenceCommentTree = sentenceCommentTrees[sent],
            Id = sentenceIds[sent],
            StoryId = this.id
        };
        topSentenceObjs.Add(sentenceObj);
    }

    topSentenceObjs = topSentenceObjs
        .OrderByDescending(x => GetChildCount(GetNodeById(x.Id)))
        .ToList();

    return topSentenceObjs;
}
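// Hypothetical caller ("story" stands in for an instance of the containing
// comment-tree type): print the five top-scoring sentences.
List<SentenceObj> top = story.GetTopSentences(5);
foreach (SentenceObj s in top)
{
    Console.WriteLine(s.Author + ": " + s.Sentence);
}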
public List<string> GetAnchorWords(children root, int N)
{
    string[] allWords = GetAllWords(root.SubtreeText);
    Dictionary<string, string> stemParentDictionary = GetStemParentDictionary(allWords);

    // Collect, per direct child of the root, the set of comment IDs in
    // that child's subtree.
    children rootNode = new children();
    List<HashSet<int>> rootChildIDs = new List<HashSet<int>>();
    foreach (children child in root.Children)
    {
        GetChildIDHashSetList(child);
        HashSet<int> currChildIDs = new HashSet<int> { child.id };
        foreach (var item in child.ChildIDList)
        {
            currChildIDs.UnionWith(item);
        }
        rootChildIDs.Add(currChildIDs);
    }
    rootNode.ChildIDList = rootChildIDs;

    NodeList = new List<children> { rootNode };
    foreach (children child in root.Children)
    {
        PopulateNodeList(child);
    }

    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping();

    // For each word, find the nodes that act as lowest common ancestors of
    // the comments containing it.
    Dictionary<string, List<children>> WordLCAList = new Dictionary<string, List<children>>();
    foreach (var kvp in wordIDMapping)
    {
        List<children> currLCAList = new List<children>();
        foreach (children node in NodeList)
        {
            int numBranchesWithWord = 0;
            foreach (var childIDBranch in node.ChildIDList)
            {
                if (childIDBranch.Intersect(kvp.Value).Any())
                {
                    numBranchesWithWord += 1;
                }
            }
            if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) ||
                numBranchesWithWord > 1)
            {
                currLCAList.Add(node);
            }
        }
        WordLCAList[stemParentDictionary.ContainsKey(kvp.Key)
            ? stemParentDictionary[kvp.Key]
            : kvp.Key] = currLCAList;
    }

    // Anchor words have the most LCAs while staying uncommon in everyday usage.
    List<string> anchorWords = WordLCAList
        .OrderByDescending(x => x.Value.Count)
        .Select(x => x.Key)
        .Where(y => CommonWords.GetFrequency(y) < 20)
        .Take(N)
        .ToList();

    return anchorWords;
}
public Dictionary<string, List<CommentObj>> GetNamedObjects(int N)
{
    // Gather the text of every top-level subtree.
    StringBuilder sbAllWords = new StringBuilder();
    foreach (children child in children)
    {
        sbAllWords.Append(child.SubtreeText);
        sbAllWords.Append(" ");
    }
    string[] allWords = GetAllWords(sbAllWords.ToString());
    Dictionary<string, string> stemParentDictionary = GetStemParentDictionary(allWords);

    // Collect, per top-level comment, the set of comment IDs in its subtree.
    children rootNode = new children();
    List<HashSet<int>> rootChildIDs = new List<HashSet<int>>();
    foreach (children child in children)
    {
        GetChildIDHashSetList(child);
        HashSet<int> currChildIDs = new HashSet<int> { child.id };
        foreach (var item in child.ChildIDList)
        {
            currChildIDs.UnionWith(item);
        }
        rootChildIDs.Add(currChildIDs);
    }
    rootNode.ChildIDList = rootChildIDs;

    NodeList = new List<children> { rootNode };
    foreach (children child in children)
    {
        PopulateNodeList(child);
    }

    Dictionary<string, HashSet<int>> wordIDMapping = GetWordIDMapping();

    // For each word, find the nodes that act as lowest common ancestors of
    // the comments containing it.
    Dictionary<string, List<children>> WordLCAList = new Dictionary<string, List<children>>();
    foreach (var kvp in wordIDMapping)
    {
        List<children> currLCAList = new List<children>();
        foreach (children node in NodeList)
        {
            int numBranchesWithWord = 0;
            foreach (var childIDBranch in node.ChildIDList)
            {
                if (childIDBranch.Intersect(kvp.Value).Any())
                {
                    numBranchesWithWord += 1;
                }
            }
            if ((numBranchesWithWord == 1 && node.ChildIDList.Count == 1) ||
                numBranchesWithWord > 1)
            {
                currLCAList.Add(node);
            }
        }
        WordLCAList[stemParentDictionary.ContainsKey(kvp.Key)
            ? stemParentDictionary[kvp.Key]
            : kvp.Key] = currLCAList;
    }

    // Named objects: rare, capitalized, multi-character words that are not
    // contraction fragments, ranked by LCA count.
    List<string> namedObjects = WordLCAList
        .OrderByDescending(x => x.Value.Count)
        .Select(x => x.Key)
        .Where(y => CommonWords.GetFrequency(y) < 1)
        .Where(a => char.IsUpper(a[0]))
        .Where(b => b.Length > 1)
        .Where(z => !(z.EndsWith("n't") || z.EndsWith("'m") || z.EndsWith("'ll") ||
                      z.EndsWith("'d") || z.EndsWith("'ve") || z.EndsWith("'re") ||
                      z.EndsWith("'s")))
        .Take(N)
        .ToList();

    // Map each named object to the comments whose text contains it.
    Dictionary<string, List<CommentObj>> namedObjectDictionary = new Dictionary<string, List<CommentObj>>();
    foreach (string namedObject in namedObjects)
    {
        List<CommentObj> commentObjsForWord = new List<CommentObj>();
        string stem = Stemmer.GetStem(namedObject);
        HashSet<int> idsWithWord = wordIDMapping[stem];
        foreach (int id in idsWithWord)
        {
            children child = GetNodeById(id);
            commentObjsForWord.Add(new CommentObj { Id = id, Text = child.text });
        }
        namedObjectDictionary[namedObject] = commentObjsForWord;
    }

    var ordered = namedObjectDictionary.Keys
        .OrderByDescending(x => namedObjectDictionary[x].Count)
        .ToDictionary(x => x, x => namedObjectDictionary[x]);
    return ordered;
}
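// Hypothetical caller (again assuming a "story" instance): list each named
// object alongside how many comments mention it.
Dictionary<string, List<CommentObj>> named = story.GetNamedObjects(10);
foreach (var kvp in named)
{
    Console.WriteLine(kvp.Key + ": " + kvp.Value.Count + " comments");
}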
public void TestMethod1()
{
    int freq = CommonWords.GetFrequency("this");
    Console.WriteLine(freq);
}