/// <summary>
/// A single slot accepted by the "before" predicate set must survive filtering.
/// </summary>
public void Test_Predicates_Before_1()
{
    // Arrange: one slot the predicates are expected to keep.
    Slot[] slots = { TestData.GetSlot(9) };

    // Act
    var filtered = Filterer.Filter(slots, Predicates_Before);

    // Assert: exactly one slot remains.
    Assert.True(filtered.Length == 1);
}
/// <summary>
/// Two slots that cannot satisfy the all-day predicates yield a null result.
/// </summary>
public void Test_Predicates_AllDay_2()
{
    // Arrange: a pair of slots that do not cover the required range.
    Slot[] slots = { TestData.GetSlot(281), TestData.GetSlot(289) };

    // Act
    var filtered = Filterer.Filter(slots, Predicates_AllDay);

    // Assert: the filter signals "no valid combination" with null.
    Assert.True(filtered == null);
}
/// <summary>
/// Filtering the slot range 281..307 with the all-day predicates keeps 11 slots.
/// </summary>
public void Test_Predicates_AllDay_3()
{
    // Arrange
    var slots = TestData.GetSlotRange(281, 307).ToArray();

    // Act
    var filtered = Filterer.Filter(slots, Predicates_AllDay);

    // Assert
    Assert.True(filtered.Length == 11);
}
/// <summary>
/// When a rule's expression throws, the row must be kept (not deleted) and the
/// evaluation error recorded on the row context.
/// </summary>
public void FilterWhenExpressionThrows()
{
    // Arrange: a single enabled rule whose expression evaluation blows up.
    var rule = new FilterRule { Name = "Rule1", Expression = "source['a'] == 3", IsEnabled = true };
    var evaluator = new TestExpressionEvaluator();
    evaluator.Evaluations["source['a'] == 3"] = () => throw new Exception("Some error");

    var headers = CsvParser.GetParts("a,b,c");
    var context = new SourceContext { Source = CsvParser.Parse("1,2,3", headers) };

    // Act
    var sut = new Filterer(new List<FilterRule> { rule }, evaluator);
    sut.Filter(context);

    // Assert: row survives and the thrown exception surfaces as a row error.
    Assert.IsFalse(context.IsDeleted);
    Assert.AreEqual("Some error", context.SourceRowErrors[0].Exception.Message);
}
/// <summary>
/// Filtering the slot range 309..315 with the all-day predicates yields null.
/// </summary>
public void Test_Predicates_AllDay_1()
{
    // Arrange
    var slots = TestData.GetSlotRange(309, 315).ToArray();

    // Act
    var filtered = Filterer.Filter(slots, Predicates_AllDay);

    // Assert: null means no slot combination satisfied the predicates.
    Assert.True(filtered == null);
}
/// <summary>
/// When a rule's expression evaluates to false, the row must be marked deleted.
/// </summary>
public void FilterWhenFilterReturnsFalse()
{
    // Arrange: a single enabled rule whose expression evaluates to false.
    var rule = new FilterRule { Name = "Rule1", Expression = "source['a'] == 3", IsEnabled = true };
    var evaluator = new TestExpressionEvaluator();
    evaluator.Evaluations["source['a'] == 3"] = () => false;

    var headers = CsvParser.GetParts("a,b,c");
    var context = new SourceContext { Source = CsvParser.Parse("1,2,3", headers) };

    // Act
    var sut = new Filterer(new List<FilterRule> { rule }, evaluator);
    sut.Filter(context);

    // Assert: a false filter result deletes the row.
    Assert.IsTrue(context.IsDeleted);
}
/// <summary>
/// Filters the given slots with the configured predicates, then runs the
/// permutator on a background worker behind a loading screen and returns
/// the resulting slot combinations.
/// </summary>
/// <param name="input">Candidate slots to filter and permute.</param>
/// <returns>All slot combinations produced by the permutator.</returns>
private List<List<Slot>> RunPermutation(Slot[] input)
{
    var remaining = Filterer.Filter(input, _predicates);

    // Heavy combination search runs off the UI thread with a progress dialog.
    var worker = CustomBackgroundWorker<Slot[], List<List<Slot>>>
        .RunAndShowLoadingScreen(_permutator, remaining, "Finding possible combination . . . ");

    return worker.GetResult();
}
/// <summary>
/// The default filtering implementation uses the injected <see cref="Filterer"/>
/// object to filter the given datasource with the provided filter.
/// </summary>
/// <param name="source">The datasource to apply filtering to.</param>
/// <param name="filter">The filter to apply to the datasource.</param>
/// <returns>The filtered dataset.</returns>
public override IQueryable<TSource> Filter(IQueryable<TSource> source, TFilter filter)
{
    // Pure delegation: all filtering logic lives in the injected Filterer.
    return Filterer.Filter(source, filter);
}
// Extracts per-user text features from stored tweets and (optionally) writes the
// averaged features to a MongoDB collection. UI state (date picker, checkboxes,
// output table/folder text boxes) is captured on the UI thread before the heavy
// scan is offloaded to a background Task.
//
// Per tweet the pipeline is: strip/normalize via the filter chain (hashtags, slang,
// URLs, user/place mentions, repeated chars), spell-check, sentiment-score, then
// accumulate counters. Tweets where spelling errors * 2 exceed the term count are
// treated as non-English and their cleaned text is not kept. Per user, the kept
// texts are written to a file and, when enabled, a feature document keyed by the
// user id is inserted into the configured output collection (divisors guarded by
// the numberOfTweets/numberOfTermsTotal == 0 check on that path only).
//
// NOTE(review): the span `Write("User: "******".txt");` is corrupted/redacted —
// it presumably logged the user id and declared `filePath` (used below by
// File.Exists and File.WriteAllLines) from outputFolder + userId + ".txt". This
// method will not compile as-is; TODO: restore the original statements from
// version control.
// NOTE(review): `testSetSeparationDate` is captured but never used in the visible
// body — confirm whether it was meant to bound the tweet query.
private void generateTwitterUserFeatures() { var testSetSeparationDate = Calendar.SelectedDate; var writeFeaturesToDB = CheckBoxExtractTextFeatures.IsChecked.Value; var textFeaturesOutputTableName = TextBoxTextFeatures.Text; var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName); var useUsersTable = !CheckBoxDoNotUseUsersTable.IsChecked.Value; var sentimentAnalyser = new SentimentAnalyser(); var spellChecker = new SpellCheckCorrector(new TextBox()); var outputFolder = TextBoxUserTweetsOutputPath.Text; Task.Run(() => { var filterer = new Filterer(); filterer.Filters.Add("HashTags", new HashTagsFilter()); filterer.Filters.Add("Slang", new SlangCorrector()); filterer.Filters.Add("Url", new UrlFilter()); filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter()); filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter()); Write("Creating index on twitter_name field"); tweetsTable.CreateIndex(new IndexKeysBuilder().Ascending("data.Creator._id")); //tweetsTable.CreateIndex(new IndexKeysBuilder().Ascending("data.user.id"));TODO: check why it is user but not creator ;) Write("Index created"); var users = new List<string>(); if (useUsersTable) { Write("Scanning users table"); users.AddRange(usersTable.FindAll().ToList().ConvertAll(x => x.GetValue("_id").AsString)); //.Reverse();//.ToArray(); } else { Write("Requesting all distinct users in database..."); users.AddRange(tweetsTable.Distinct("data.Creator._id").ToList().ConvertAll(x => //users.AddRange(tweetsTable.Distinct("data.user.id").ToList().ConvertAll(x => { try { return x.AsInt64.ToString(); } catch (Exception) { return x.AsInt32.ToString(); } })); } Write("Twitter users list obtained: " + users.Count + " users"); foreach (var userId in users) { Write("User: "******".txt"); if (File.Exists(filePath)) continue; #region Features extraction var numberOfHashtags = 0.0; var numberOfSlang = 0.0; var numberOfUrls = 0.0; var numberOfMentions = 
0.0; var numberOfRepeatedChars = 0.0; var numberOfEmotionWords = 0.0; var averageSentiLevel = 0.0; var averageSentiScore = 0.0; var numberOfEmoticons = 0.0; var numberOfMisspellings = 0.0; var numberOfMistakes = 0.0; var numberOfRejectedTweets = 0.0; long numberOfTweets = 0; long numberOfTermsTotal = 0; #endregion Write("Requesting all user's tweets..."); IEnumerable<BsonDocument> tweets; try { tweets = tweetsTable.Find(Query.EQ("data.Creator._id", long.Parse(userId))).ToList(); } catch (OutOfMemoryException) { Write("Can't download all user's tweets due to the memory limit. Processing sequentially..."); tweets = tweetsTable.Find(Query.EQ("data.Creator._id", long.Parse(userId))); } Write("Tweets obtained: " + tweets.Count() + " tweets"); var tweetTexts = new ArrayList<string>(); foreach (var tweet in tweets) { if (tweet.GetElement("isRetweet").Value.AsBoolean) continue; //Tweet is retweeted var text = tweet.GetValue("text").AsString; int totalFiltered; text = filterer.Filter(text, out totalFiltered); int filtered; var spellCheckedTweet = spellChecker.Filter(text, out filtered); var errorNumber = spellChecker.LastNumberOfErrors; //IMPORTANT FEATURE FOR AGE PREDICTION var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Count(); if (errorNumber * 2 > numberOfTerms) { //Write("Tweet : " + text + " considered as not english and skipped"); //continue; //Tweet is not english or too bad grammar } else { text = spellCheckedTweet; tweetTexts.Add(text); } #region Features extraction numberOfHashtags += filterer.LastResults["HashTags"]; numberOfSlang += filterer.LastResults["Slang"]; numberOfUrls += filterer.LastResults["Url"]; numberOfMentions += filterer.LastResults["UserMentionsAndPlaceMentions"]; numberOfRepeatedChars += filterer.LastResults["RepeatedChars"]; int sentiwords; int emoticons; var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons); averageSentiLevel += Math.Abs(sentiscore); averageSentiScore += 
sentiscore; numberOfEmoticons += emoticons; numberOfEmotionWords += sentiwords; numberOfMisspellings += filtered; numberOfMistakes += errorNumber; numberOfRejectedTweets += errorNumber * 2 > numberOfTerms ? 1 : 0; numberOfTermsTotal += numberOfTerms; #endregion numberOfTweets++; } File.WriteAllLines(filePath, tweetTexts); if (writeFeaturesToDB) { if (numberOfTweets == 0 || numberOfTermsTotal == 0) continue; var textFeaturesDoc = new BsonDocument { {"_id", userId}, {"numberOfHashtags", numberOfHashtags/numberOfTweets}, {"numberOfSlang", numberOfSlang/numberOfTermsTotal}, {"numberOfUrls", numberOfUrls/numberOfTweets}, {"numberOfMentions", numberOfMentions/numberOfTweets}, {"numberOfRepeatedChars", numberOfRepeatedChars/numberOfTermsTotal}, {"numberOfEmotionWords", numberOfEmotionWords/numberOfTermsTotal}, {"numberOfEmoticons", numberOfEmoticons/numberOfTermsTotal}, {"averageSentiLevel", averageSentiLevel/numberOfTweets}, {"averageSentiScore", averageSentiScore/numberOfTweets}, {"numberOfMisspellings", numberOfMisspellings/numberOfTermsTotal}, {"numberOfMistakes", numberOfMistakes/numberOfTermsTotal}, {"numberOfRejectedTweets", numberOfRejectedTweets/numberOfTweets}, {"numberOfTermsAverage", (double) numberOfTermsTotal/numberOfTweets}, {"numberOfTweets", numberOfTweets} }; textFeaturesOutputTable.Insert(textFeaturesDoc); } } Write("Finished"); }); }
// Computes text features for each individual non-retweet tweet on a background
// Task. Tweets already present in the output collection (matched by _id) are
// skipped. Each tweet is run through the same filter chain as the per-user
// variant (hashtags, slang, URLs, mentions, repeated chars), spell-checked, and
// sentiment-scored; any exception aborts the loop and is logged via Write.
//
// NOTE(review): the computed values (totalFiltered, filtered, errorNumber,
// sentiscore, sentiwords, emoticons, rejected) are never inserted into
// textFeaturesOutputTable — the per-message feature document write appears to be
// missing, so the dedup check against the output table can never trigger.
// TODO confirm intent.
// NOTE(review): `rejected` is only set inside the 1-in-1000 logging throttle, so
// it does not flag every suspect tweet; it is also unused afterwards.
private void generateTwitterFeaturesForEachMessage() { var textFeaturesOutputTableName = TextBoxTextFeatures.Text; var textFeaturesOutputTable = database.GetCollection<BsonDocument>(textFeaturesOutputTableName); var sentimentAnalyser = new SentimentAnalyser(); var spellChecker = new SpellCheckCorrector(new TextBox()); Task.Run(() => { var filterer = new Filterer(); filterer.Filters.Add("HashTags", new HashTagsFilter()); filterer.Filters.Add("Slang", new SlangCorrector()); filterer.Filters.Add("Url", new UrlFilter()); filterer.Filters.Add("UserMentionsAndPlaceMentions", new UserMentionsAndPlaceMentionsFilter()); filterer.Filters.Add("RepeatedChars", new RepeatedCharsFilter()); Write("Requesting all user's tweets..."); try { var logMessageCount = 0; //var tweets = tweetsTable.FindAll(); //Write("Total number of tweets:" + tweets.Count()); foreach (var tweet in tweetsTable.Find(Query.EQ("isRetweet", false))) { // if (tweet.GetElement("isRetweet").Value.AsBoolean) continue; //Tweet is retweeted if (textFeaturesOutputTable.FindOne(Query.EQ("_id", tweet["_id"])) != null) continue; var text = tweet.GetValue("text").AsString; int totalFiltered; text = filterer.Filter(text, out totalFiltered); int filtered; spellChecker.Filter(text, out filtered); var errorNumber = spellChecker.LastNumberOfErrors; //IMPORTANT FEATURE FOR AGE PREDICTION var numberOfTerms = text.Split(new[] { " " }, StringSplitOptions.RemoveEmptyEntries).Count(); bool rejected = false; if (errorNumber * 2 > numberOfTerms) { logMessageCount++; if (logMessageCount % 1000 == 0) { rejected = true; Write(logMessageCount + ": Tweet : " + text + " considered as not english and skipped"); } } int sentiwords; int emoticons; var sentiscore = sentimentAnalyser.GetSentiments(text, out sentiwords, out emoticons); } Write("Finished"); } catch (Exception exc) { Write(exc.ToString()); } }); }