/// <summary>
/// Starts a background task that, for every Twitter user, cleans and spell-checks the
/// user's non-retweet tweets, writes the accepted texts to
/// <c>&lt;output folder&gt;/&lt;user id&gt;.txt</c> and, when requested, stores the averaged
/// text features of that user in the text-features collection.
/// </summary>
/// <remarks>
/// All control values are captured into locals before the task starts; the task body
/// itself only works with those locals, the collections and <c>Write</c>.
/// Any exception raised inside the task is logged through <c>Write</c> instead of being
/// left unobserved.
/// </remarks>
private void generateTwitterUserFeatures()
{
    var writeFeaturesToDB = CheckBoxExtractTextFeatures.IsChecked.Value;
    var textFeaturesOutputTableName = TextBoxTextFeatures.Text;
    var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName);
    var useUsersTable = !CheckBoxDoNotUseUsersTable.IsChecked.Value;
    var sentimentAnalyser = new SentimentAnalyser();
    var spellChecker = new SpellCheckCorrector(new TextBox());
    var outputFolder = TextBoxUserTweetsOutputPath.Text;

    Task.Run(() =>
    {
        try
        {
            var filterer = new Filterer();
            filterer.Filters.Add("HashTags", new HashTagsFilter());
            filterer.Filters.Add("Slang", new SlangCorrector());
            filterer.Filters.Add("Url", new UrlFilter());
            filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter());
            filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter());

            Write("Creating index on twitter_name field");
            // TODO: check why the indexed field is "data.Creator._id" and not "data.user.id".
            tweetsTable.CreateIndex(new IndexKeysBuilder().Ascending("data.Creator._id"));
            Write("Index created");

            var users = new List<string>();
            if (useUsersTable)
            {
                Write("Scanning users table");
                users.AddRange(usersTable.FindAll().Select(x => x.GetValue("_id").AsString));
            }
            else
            {
                Write("Requesting all distinct users in database...");
                // Creator ids may be stored as Int64 or Int32; test the BSON type instead of
                // catching the cast exception.
                users.AddRange(tweetsTable.Distinct("data.Creator._id")
                    .Select(x => x.IsInt64 ? x.AsInt64.ToString() : x.AsInt32.ToString()));
            }
            Write("Twitter users list obtained: " + users.Count + " users");

            foreach (var userId in users)
            {
                // NOTE(review): the statements between the "User: " log call and the
                // File.Exists check were unreadable in the reviewed source. The log call and
                // the path below (<outputFolder>/<userId>.txt) are a reconstruction -
                // confirm against the original file.
                Write("User: " + userId);
                var filePath = Path.Combine(outputFolder, userId + ".txt");

                // Skip users whose output file already exists.
                if (File.Exists(filePath))
                {
                    continue;
                }

                // Per-user accumulators; normalised below per tweet or per term.
                var numberOfHashtags = 0.0;
                var numberOfSlang = 0.0;
                var numberOfUrls = 0.0;
                var numberOfMentions = 0.0;
                var numberOfRepeatedChars = 0.0;
                var numberOfEmotionWords = 0.0;
                var averageSentiLevel = 0.0;
                var averageSentiScore = 0.0;
                var numberOfEmoticons = 0.0;
                var numberOfMisspellings = 0.0;
                var numberOfMistakes = 0.0;
                var numberOfRejectedTweets = 0.0;
                long numberOfTweets = 0;
                long numberOfTermsTotal = 0;

                Write("Requesting all user's tweets...");
                var userQuery = Query.EQ("data.Creator._id", long.Parse(userId));
                List<BsonDocument> loadedTweets = null;
                try
                {
                    loadedTweets = tweetsTable.Find(userQuery).ToList();
                }
                catch (OutOfMemoryException)
                {
                    Write("Can't download all user's tweets due to the memory limit. Processing sequentially...");
                }

                IEnumerable<BsonDocument> tweets;
                if (loadedTweets != null)
                {
                    tweets = loadedTweets;
                    Write("Tweets obtained: " + loadedTweets.Count + " tweets");
                }
                else
                {
                    // Streaming fallback: take the count from the cursor rather than LINQ's
                    // Count() over IEnumerable, which would download every document once
                    // just to count it and then again in the loop below.
                    var cursor = tweetsTable.Find(userQuery);
                    tweets = cursor;
                    Write("Tweets obtained: " + cursor.Count() + " tweets");
                }

                var tweetTexts = new List<string>();
                foreach (var tweet in tweets)
                {
                    if (tweet.GetElement("isRetweet").Value.AsBoolean)
                    {
                        continue; // Retweets are not the user's own text.
                    }

                    var text = tweet.GetValue("text").AsString;
                    int totalFiltered;
                    text = filterer.Filter(text, out totalFiltered);

                    int filtered;
                    var spellCheckedTweet = spellChecker.Filter(text, out filtered);
                    var errorNumber = spellChecker.LastNumberOfErrors; // IMPORTANT FEATURE FOR AGE PREDICTION
                    var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Length;

                    // A tweet where more than half of the terms are spelling errors is counted
                    // as rejected (not English or too bad grammar). It is left out of the
                    // output file but still contributes to the feature sums below.
                    var rejected = errorNumber * 2 > numberOfTerms;
                    if (!rejected)
                    {
                        text = spellCheckedTweet;
                        tweetTexts.Add(text);
                    }

                    numberOfHashtags += filterer.LastResults["HashTags"];
                    numberOfSlang += filterer.LastResults["Slang"];
                    numberOfUrls += filterer.LastResults["Url"];
                    numberOfMentions += filterer.LastResults["UserMentionsAndPlaceMentions"];
                    numberOfRepeatedChars += filterer.LastResults["RepeatedChars"];

                    int sentiwords;
                    int emoticons;
                    var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons);
                    averageSentiLevel += Math.Abs(sentiscore);
                    averageSentiScore += sentiscore;
                    numberOfEmoticons += emoticons;
                    numberOfEmotionWords += sentiwords;
                    numberOfMisspellings += filtered;
                    numberOfMistakes += errorNumber;
                    numberOfRejectedTweets += rejected ? 1 : 0;
                    numberOfTermsTotal += numberOfTerms;

                    numberOfTweets++;
                }

                File.WriteAllLines(filePath, tweetTexts);

                // Features are only stored when there is something to normalise by.
                if (writeFeaturesToDB && numberOfTweets != 0 && numberOfTermsTotal != 0)
                {
                    var textFeaturesDoc = new BsonDocument
                    {
                        {"_id", userId},
                        {"numberOfHashtags", numberOfHashtags/numberOfTweets},
                        {"numberOfSlang", numberOfSlang/numberOfTermsTotal},
                        {"numberOfUrls", numberOfUrls/numberOfTweets},
                        {"numberOfMentions", numberOfMentions/numberOfTweets},
                        {"numberOfRepeatedChars", numberOfRepeatedChars/numberOfTermsTotal},
                        {"numberOfEmotionWords", numberOfEmotionWords/numberOfTermsTotal},
                        {"numberOfEmoticons", numberOfEmoticons/numberOfTermsTotal},
                        {"averageSentiLevel", averageSentiLevel/numberOfTweets},
                        {"averageSentiScore", averageSentiScore/numberOfTweets},
                        {"numberOfMisspellings", numberOfMisspellings/numberOfTermsTotal},
                        {"numberOfMistakes", numberOfMistakes/numberOfTermsTotal},
                        {"numberOfRejectedTweets", numberOfRejectedTweets/numberOfTweets},
                        {"numberOfTermsAverage", (double) numberOfTermsTotal/numberOfTweets},
                        {"numberOfTweets", numberOfTweets}
                    };
                    textFeaturesOutputTable.Insert(textFeaturesDoc);
                }
            }
            Write("Finished");
        }
        catch (Exception exc)
        {
            // Without this the exception would stay inside the unawaited task and the run
            // would stop silently.
            Write(exc.ToString());
        }
    });
}
/// <summary>
/// Starts a background task that walks every non-retweet document of the tweets
/// collection which has no entry yet in the text-features collection, and runs the
/// filter, spell-check and sentiment steps on its text.
/// </summary>
/// <remarks>
/// NOTE(review): in the code visible here the computed values (error count, term count,
/// rejected flag, sentiment score, sentiment words, emoticons) are never written to the
/// text-features collection, and rejected tweets are not actually skipped even though the
/// log message says so. This looks unfinished - confirm the intended output document
/// before relying on this method.
/// Any exception raised while processing is logged through <c>Write</c>.
/// </remarks>
private void generateTwitterFeaturesForEachMessage()
{
    var textFeaturesOutputTableName = TextBoxTextFeatures.Text;
    var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName);
    var sentimentAnalyser = new SentimentAnalyser();
    var spellChecker = new SpellCheckCorrector(new TextBox());

    Task.Run(() =>
    {
        var filterer = new Filterer();
        filterer.Filters.Add("HashTags", new HashTagsFilter());
        filterer.Filters.Add("Slang", new SlangCorrector());
        filterer.Filters.Add("Url", new UrlFilter());
        filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter());
        filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter());

        Write("Requesting all user's tweets...");
        try
        {
            var logMessageCount = 0;
            // Retweets are excluded by the query itself.
            foreach (var tweet in tweetsTable.Find(Query.EQ("isRetweet", false)))
            {
                // Skip tweets that already have a document in the output collection.
                if (textFeaturesOutputTable.FindOne(Query.EQ("_id", tweet["_id"])) != null)
                {
                    continue;
                }

                var text = tweet.GetValue("text").AsString;
                int totalFiltered;
                text = filterer.Filter(text, out totalFiltered);

                int filtered;
                spellChecker.Filter(text, out filtered);
                var errorNumber = spellChecker.LastNumberOfErrors; // IMPORTANT FEATURE FOR AGE PREDICTION
                var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Length;

                // The flag reflects the rejection test itself. Previously it was only set
                // inside the log throttle below, i.e. for every 1000th rejected tweet.
                bool rejected = errorNumber * 2 > numberOfTerms;
                if (rejected)
                {
                    logMessageCount++;
                    // Only every 1000th rejected tweet is logged to keep the output readable.
                    if (logMessageCount % 1000 == 0)
                    {
                        Write(logMessageCount + ": Tweet : " + text + " considered as not english and skipped");
                    }
                }

                int sentiwords;
                int emoticons;
                var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons);
            }
            Write("Finished");
        }
        catch (Exception exc)
        {
            Write(exc.ToString());
        }
    });
}