Example #1
0
        /// <summary>
        /// Reads the extraction settings from the UI controls, then, on a background task, walks a list
        /// of user ids, loads each user's tweets from <c>tweetsTable</c>, runs the text filters, spell
        /// checker and sentiment analyser over every non-retweet, writes the accepted tweet texts to a
        /// per-user file and, when requested, inserts one averaged feature document per user into the
        /// output collection.
        /// </summary>
        /// <remarks>
        /// The method returns as soon as the task is started; the task itself is not awaited or stored.
        /// Progress is reported through <c>Write</c>.
        /// </remarks>
        private void generateTwitterUserFeatures()
        {
            // NOTE(review): this value is read but never used anywhere in the method.
            var testSetSeparationDate = Calendar.SelectedDate;

            // UI state is captured here, before the task starts, so the lambda only touches plain locals.
            var writeFeaturesToDB = CheckBoxExtractTextFeatures.IsChecked.Value;
            var textFeaturesOutputTableName = TextBoxTextFeatures.Text;
            var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName);
            var useUsersTable = !CheckBoxDoNotUseUsersTable.IsChecked.Value;

            var sentimentAnalyser = new SentimentAnalyser();

            var spellChecker = new SpellCheckCorrector(new TextBox());
            var outputFolder = TextBoxUserTweetsOutputPath.Text;

            Task.Run(() =>
            {
                // The filter keys registered here are the same keys read back from
                // filterer.LastResults in the per-tweet loop below.
                var filterer = new Filterer();
                filterer.Filters.Add("HashTags", new HashTagsFilter());
                filterer.Filters.Add("Slang", new SlangCorrector());
                filterer.Filters.Add("Url", new UrlFilter());
                filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter());
                filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter());

                // NOTE(review): the log message mentions "twitter_name", but the index is built on
                // "data.Creator._id", the field used by the per-user queries below.
                Write("Creating index on twitter_name field");
                tweetsTable.CreateIndex(new IndexKeysBuilder().Ascending("data.Creator._id"));
                //tweetsTable.CreateIndex(new IndexKeysBuilder().Ascending("data.user.id"));TODO: check why it is user but not creator ;)
                Write("Index created");

                var users = new List<string>();

                if (useUsersTable)
                {
                    Write("Scanning users table");

                    // Every document of usersTable contributes its "_id", read as a string.
                    users.AddRange(usersTable.FindAll().ToList().ConvertAll(x => x.GetValue("_id").AsString)); //.Reverse();//.ToArray();
                }
                else
                {
                    Write("Requesting all distinct users in database...");

                    // Distinct creator ids are converted to strings: AsInt64 is tried first and any
                    // exception falls back to AsInt32.
                    users.AddRange(tweetsTable.Distinct("data.Creator._id").ToList().ConvertAll(x =>
                    //users.AddRange(tweetsTable.Distinct("data.user.id").ToList().ConvertAll(x =>
                    {
                        try
                        {

                            return x.AsInt64.ToString();
                        }
                        catch (Exception)
                        {
                            return x.AsInt32.ToString();
                        }
                    }));
                }

                Write("Twitter users list obtained: " + users.Count + " users");

                foreach (var userId in users)
                {
                    // NOTE(review): the next line is corrupted in this copy of the source (part of it
                    // has been masked out). It presumably logged the user id and declared filePath,
                    // which is used below, from outputFolder, userId and a ".txt" suffix - restore the
                    // original statement(s) from version control.
                    Write("User: "******".txt");

                    // Users whose output file already exists are skipped entirely.
                    if (File.Exists(filePath)) continue;

                    #region Features extraction

                    // Per-user running totals; they are divided by tweet or term counts when the
                    // feature document is built.
                    var numberOfHashtags = 0.0;
                    var numberOfSlang = 0.0;
                    var numberOfUrls = 0.0;
                    var numberOfMentions = 0.0;
                    var numberOfRepeatedChars = 0.0;
                    var numberOfEmotionWords = 0.0;
                    var averageSentiLevel = 0.0;
                    var averageSentiScore = 0.0;
                    var numberOfEmoticons = 0.0;
                    var numberOfMisspellings = 0.0;
                    var numberOfMistakes = 0.0;
                    var numberOfRejectedTweets = 0.0;
                    long numberOfTweets = 0;
                    long numberOfTermsTotal = 0;

                    #endregion

                    Write("Requesting all user's tweets...");

                    IEnumerable<BsonDocument> tweets;

                    try
                    {
                        // Preferred path: load all of the user's tweets into a list.
                        tweets = tweetsTable.Find(Query.EQ("data.Creator._id", long.Parse(userId))).ToList();
                    }
                    catch (OutOfMemoryException)
                    {
                        // Fallback: keep the query result without calling ToList().
                        Write("Can't download all user's tweets due to the memory limit. Processing sequentially...");
                        tweets = tweetsTable.Find(Query.EQ("data.Creator._id", long.Parse(userId)));
                    }

                    // NOTE(review): Count() walks the sequence; on the fallback path this presumably
                    // enumerates the query result once here and again in the loop below - confirm.
                    Write("Tweets obtained: " + tweets.Count() + " tweets");

                    var tweetTexts = new ArrayList<string>();

                    foreach (var tweet in tweets)
                    {
                        if (tweet.GetElement("isRetweet").Value.AsBoolean) continue; //Tweet is retweeted

                        var text = tweet.GetValue("text").AsString;

                        int totalFiltered;
                        text = filterer.Filter(text, out totalFiltered);

                        int filtered;
                        var spellCheckedTweet = spellChecker.Filter(text, out filtered);
                        var errorNumber = spellChecker.LastNumberOfErrors; //IMPORTANT FEATURE FOR AGE PREDICTION
                        var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Count();
                        // A tweet is treated as rejected when more than half of its terms were counted
                        // as errors. Because the "continue" is commented out, a rejected tweet is only
                        // left out of the output file; it still contributes to every counter below.
                        if (errorNumber * 2 > numberOfTerms)
                        {
                            //Write("Tweet : " + text + " considered as not english and skipped");
                            //continue; //Tweet is not english or too bad grammar
                        }
                        else
                        {
                            text = spellCheckedTweet;

                            tweetTexts.Add(text);
                        }

                        #region Features extraction

                        numberOfHashtags += filterer.LastResults["HashTags"];
                        numberOfSlang += filterer.LastResults["Slang"];
                        numberOfUrls += filterer.LastResults["Url"];
                        numberOfMentions += filterer.LastResults["UserMentionsAndPlaceMentions"];
                        numberOfRepeatedChars += filterer.LastResults["RepeatedChars"];

                        // Sentiment is computed on "text": the spell-checked text for accepted tweets,
                        // the filtered but uncorrected text for rejected ones.
                        int sentiwords;
                        int emoticons;
                        var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons);

                        averageSentiLevel += Math.Abs(sentiscore);
                        averageSentiScore += sentiscore;
                        numberOfEmoticons += emoticons;
                        numberOfEmotionWords += sentiwords;

                        numberOfMisspellings += filtered;
                        numberOfMistakes += errorNumber;
                        numberOfRejectedTweets += errorNumber * 2 > numberOfTerms ? 1 : 0;
                        numberOfTermsTotal += numberOfTerms;

                        #endregion

                        numberOfTweets++;
                    }

                    // The file is written even when tweetTexts is empty, so the File.Exists check
                    // above will skip this user on a later run.
                    File.WriteAllLines(filePath, tweetTexts);

                    if (writeFeaturesToDB)
                    {
                        // Avoids dividing by zero below; the user's file has already been written.
                        if (numberOfTweets == 0 || numberOfTermsTotal == 0) continue;

                        // Hashtags, URLs, mentions, sentiment and rejections are averaged per tweet;
                        // slang, repeated characters, emotion words, emoticons, misspellings and
                        // mistakes are averaged per term.
                        var textFeaturesDoc = new BsonDocument
                            {
                                {"_id", userId},
                                {"numberOfHashtags", numberOfHashtags/numberOfTweets},
                                {"numberOfSlang", numberOfSlang/numberOfTermsTotal},
                                {"numberOfUrls", numberOfUrls/numberOfTweets},
                                {"numberOfMentions", numberOfMentions/numberOfTweets},
                                {"numberOfRepeatedChars", numberOfRepeatedChars/numberOfTermsTotal},
                                {"numberOfEmotionWords", numberOfEmotionWords/numberOfTermsTotal},
                                {"numberOfEmoticons", numberOfEmoticons/numberOfTermsTotal},
                                {"averageSentiLevel", averageSentiLevel/numberOfTweets},
                                {"averageSentiScore", averageSentiScore/numberOfTweets},
                                {"numberOfMisspellings", numberOfMisspellings/numberOfTermsTotal},
                                {"numberOfMistakes", numberOfMistakes/numberOfTermsTotal},
                                {"numberOfRejectedTweets", numberOfRejectedTweets/numberOfTweets},
                                {"numberOfTermsAverage", (double) numberOfTermsTotal/numberOfTweets},
                                {"numberOfTweets", numberOfTweets}
                            };

                        textFeaturesOutputTable.Insert(textFeaturesDoc);
                    }
                }

                Write("Finished");
            });
        }
Example #2
0
        /// <summary>
        /// On a background task, walks every document of <c>tweetsTable</c> whose "isRetweet" field is
        /// false and runs the text filters, spell checker and sentiment analyser over its text.
        /// </summary>
        /// <remarks>
        /// The method returns as soon as the task is started; the task is not awaited or stored.
        /// Any exception raised inside the task is caught and reported through <c>Write</c>.
        /// NOTE(review): the per-tweet values computed here (rejection flag, sentiment score, sentiment
        /// word and emoticon counts) are not consumed, and nothing is ever written to the output
        /// collection - the persistence step appears to be missing.
        /// </remarks>
        private void generateTwitterFeaturesForEachMessage()
        {
            // UI state is read before the task starts so the lambda only touches plain locals.
            var textFeaturesOutputTableName = TextBoxTextFeatures.Text;
            var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName);

            var sentimentAnalyser = new SentimentAnalyser();

            var spellChecker = new SpellCheckCorrector(new TextBox());

            Task.Run(() =>
            {
                var filterer = new Filterer();
                filterer.Filters.Add("HashTags", new HashTagsFilter());
                filterer.Filters.Add("Slang", new SlangCorrector());
                filterer.Filters.Add("Url", new UrlFilter());
                filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter());
                filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter());

                Write("Requesting all user's tweets...");

                try
                {
                    // Counts rejected tweets; used only to throttle the log message below.
                    var rejectedTweetCount = 0;

                    foreach (var tweet in tweetsTable.Find(Query.EQ("isRetweet", false)))
                    {
                        // Skip tweets that already have a document with the same _id in the output
                        // collection.
                        if (textFeaturesOutputTable.FindOne(Query.EQ("_id", tweet["_id"])) != null)
                        {
                            continue;
                        }

                        var text = tweet.GetValue("text").AsString;

                        int totalFiltered;
                        text = filterer.Filter(text, out totalFiltered);

                        int filtered;
                        spellChecker.Filter(text, out filtered);
                        var errorNumber = spellChecker.LastNumberOfErrors; //IMPORTANT FEATURE FOR AGE PREDICTION

                        // Split returns an array, so Length is used instead of LINQ Count().
                        var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Length;

                        // A tweet is rejected when more than half of its terms were counted as errors.
                        // The flag is set for every such tweet; previously it was only set inside the
                        // log throttle, i.e. for every 1000th rejected tweet.
                        var rejected = errorNumber * 2 > numberOfTerms;

                        if (rejected)
                        {
                            rejectedTweetCount++;

                            // Only every 1000th rejection is logged to keep the output readable.
                            // NOTE(review): the message says "skipped", but processing continues.
                            if (rejectedTweetCount % 1000 == 0)
                            {
                                Write(rejectedTweetCount + ": Tweet : " + text + " considered as not english and skipped");
                            }
                        }

                        int sentiwords;
                        int emoticons;
                        var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons);
                    }

                    Write("Finished");
                }
                catch (Exception exc)
                {
                    Write(exc.ToString());
                }
            });
        }