/// <summary>
/// Loads the initial corpus data file, which contains Tweet Ids.
/// </summary>
/// <remarks>
/// The file is read as CSV without a header record. Each record is read by position:
/// field 0 is the keyword, field 1 the sentiment name, field 2 the tweet id.
/// The sentiment name is matched against the Sentiment enum ignoring case, so
/// "positive", "Positive" and "POSITIVE" are all accepted regardless of the current culture.
/// </remarks>
/// <param name="pathName">File path to the file to open.</param>
/// <returns>List of CorpusDataRow</returns>
/// <exception cref="ArgumentException">A sentiment field is empty or is not a member of the Sentiment enum.</exception>
/// <exception cref="FormatException">A tweet id field is not a valid 64-bit integer.</exception>
private static List<CorpusDataRow> LoadCorpus(string pathName)
{
    List<CorpusDataRow> corpus = new List<CorpusDataRow>();

    // Request read access only: the two-argument FileStream constructor asks for
    // read/write access, which fails on read-only corpus files.
    using (FileStream f = new FileStream(pathName, FileMode.Open, FileAccess.Read))
    using (StreamReader streamReader = new StreamReader(f))
    using (CsvReader csvReader = new CsvReader(streamReader))
    {
        csvReader.Configuration.HasHeaderRecord = false;

        while (csvReader.Read())
        {
            CorpusDataRow row = new CorpusDataRow();
            int columnIndex = 0;

            row.Keyword = csvReader.GetField(columnIndex++);

            // Case-insensitive parse instead of upper-casing the first letter by hand:
            // ToUpper() is culture-sensitive (e.g. "i" becomes a dotted capital I under
            // Turkish cultures) and First() throws on an empty field.
            string sentiment = csvReader.GetField(columnIndex++);
            row.Sentiment = (Sentiment)Enum.Parse(typeof(Sentiment), sentiment, true);

            // Tweet ids are machine-written data, so parse them culture-independently.
            row.Id = Int64.Parse(csvReader.GetField(columnIndex++), System.Globalization.CultureInfo.InvariantCulture);

            corpus.Add(row);
        }
    }

    return corpus;
}
/// <summary>
/// Appends a single CorpusDataRow to the output file as one CSV record.
/// </summary>
/// <param name="row">CorpusDataRow (with Tweet DTO populated).</param>
/// <param name="pathName">File path to output file to append to.</param>
private static void SaveResult(CorpusDataRow row, string pathName)
{
    // The file is opened in append mode for every call, so each row is written
    // after whatever the file already contains.
    using (var output = new FileStream(pathName, FileMode.Append))
    using (var textWriter = new StreamWriter(output))
    using (var recordWriter = new CsvWriter(textWriter))
    {
        recordWriter.WriteRecord<CorpusDataRow>(row);
    }
}
/// <summary>
/// Searches for English tweets matching a keyword plus a sentiment emoticon, saves every
/// accepted tweet to the output file and returns the saved rows.
/// </summary>
/// <remarks>
/// " :)" is appended to the keyword when <paramref name="sentiment"/> is Sentiment.Positive and
/// " :(" otherwise; the resulting query string is also what is stored in each row's Keyword.
/// Results are fetched page by page, moving MaxId below the smallest id already seen.
/// Searching stops when at least <paramref name="count"/> rows have been collected, when a page
/// comes back empty, or when a request fails for a reason other than the rate limit.
/// Because whole pages are processed, more than <paramref name="count"/> rows may be returned.
/// </remarks>
/// <param name="keyword">Search keyword; the sentiment emoticon is appended to it.</param>
/// <param name="sentiment">Sentiment assigned to every saved row; also selects the emoticon.</param>
/// <param name="count">Page size of each search request and the number of rows to collect before stopping.</param>
/// <param name="service">TwitterService used to run the searches.</param>
/// <param name="outputPath">File path to output data file.</param>
/// <returns>List of CorpusDataRow that were saved (with Tweet DTO populated).</returns>
private static List<CorpusDataRow> SearchTweets(string keyword, Sentiment sentiment, int count, TwitterService service, string outputPath)
{
    List<CorpusDataRow> outputCorpus = new List<CorpusDataRow>();

    // Cleaned texts that were already saved. Duplicates are detected on the cleaned text,
    // because the text kept on a saved row is the cleaned one, not the raw tweet.
    HashSet<string> savedTexts = new HashSet<string>();

    long? lastId = null;
    int skipCount = 0;

    keyword += sentiment == Sentiment.Positive ? " :)" : " :(";

    while (skipCount == 0 && outputCorpus.Count < count)
    {
        // Fetch the next page of tweets.
        var statusList = service.Search(new SearchOptions() { Q = keyword, Lang = "en", IncludeEntities = false, Count = count, MaxId = lastId });

        // The response status belongs to the search request, so check it once per page
        // and before touching the result.
        if (service.Response.StatusCode != System.Net.HttpStatusCode.OK)
        {
            // Check the rate limit.
            TwitterRateLimitStatus rateSearch = service.Response.RateLimitStatus;

            if (rateSearch != null && rateSearch.RemainingHits < 1)
            {
                DateTime resetTime = rateSearch.ResetTime + TimeSpan.FromMinutes(1);
                Console.WriteLine("Rate Limit reached. Sleeping until " + resetTime);

                // Thread.Sleep rejects negative durations; if the reset time has already
                // passed, retry immediately.
                TimeSpan wait = resetTime - DateTime.Now;
                if (wait > TimeSpan.Zero)
                {
                    Thread.Sleep(wait);
                }

                // Try this page again.
                continue;
            }

            // Some other error. Maybe 404. The loop condition ends the search.
            skipCount++;
            Console.WriteLine("Skipped " + skipCount + " records. Got " + service.Response.StatusCode + ".");
            continue;
        }

        List<TwitterStatus> statuses = (statusList == null || statusList.Statuses == null)
            ? new List<TwitterStatus>()
            : statusList.Statuses.ToList();

        // No more results: stop instead of requesting the same empty page forever.
        if (statuses.Count == 0)
        {
            break;
        }

        // MaxId is inclusive, so continue strictly below the smallest id of this page;
        // otherwise the last tweet would be returned again on every following request.
        lastId = statuses.Min(s => s.Id) - 1;

        foreach (var status in statuses)
        {
            if (!IsAcceptableSearchText(status.Text))
            {
                continue;
            }

            string cleanedText = CleanSearchText(status.Text);

            // HashSet.Add returns false when the text was saved before.
            if (!savedTexts.Add(cleanedText))
            {
                continue;
            }

            status.Text = cleanedText;

            // Convert the TwitterStatus to a Tweet DTO.
            CorpusDataRow row = new CorpusDataRow();
            row.Id = status.Id;
            row.Keyword = keyword;
            row.Tweet = Mapper.Map<TwitterStatus, Tweet>(status);
            row.Sentiment = sentiment;

            // Save the result to file.
            SaveResult(row, outputPath);
            outputCorpus.Add(row);

            if (outputCorpus.Count % 50 == 0)
            {
                Console.WriteLine("Processed " + outputCorpus.Count + " tweets.");
            }
        }
    }

    Console.WriteLine("Saved " + outputCorpus.Count + ", Skipped " + skipCount + ".");

    return outputCorpus;
}

/// <summary>
/// Decides whether a raw tweet text may be used as a search sample: it must not start with
/// "RT", must not contain ":P", and must not contain both a positive and a negative emoticon.
/// </summary>
private static bool IsAcceptableSearchText(string text)
{
    if (text.StartsWith("RT", StringComparison.Ordinal) || text.Contains(":P"))
    {
        return false;
    }

    bool hasPositive = text.Contains(":)") || text.Contains(":-)") || text.Contains(": )") || text.Contains(":D") || text.Contains("=)");
    bool hasNegative = text.Contains(":(") || text.Contains(":-(") || text.Contains(": (");

    return !(hasPositive && hasNegative);
}

/// <summary>
/// Replaces commas, line breaks, tabs and the recognised emoticons in a tweet text with spaces.
/// </summary>
private static string CleanSearchText(string text)
{
    // Applied in this order, one after another.
    string[] tokens = { ",", "\n", "\r", "\t", ":)", ":-)", ": )", ":D", "=)", ":(", ":-(", ": (" };

    foreach (string token in tokens)
    {
        text = text.Replace(token, " ");
    }

    return text;
}
/// <summary>
/// Loads the tweet text data for each id in the corpus.
/// </summary>
/// <remarks>
/// Processing starts at the index returned by GetResumeIndex for the given corpus and output file.
/// Every successfully fetched tweet has commas, line breaks and tabs in its text replaced by
/// spaces, is mapped to a Tweet DTO, appended to the output file and added to the result.
/// When a request fails because the rate limit is exhausted, the method sleeps until one minute
/// after the reported reset time and retries the same record; any other failure skips the record.
/// </remarks>
/// <param name="service">TwitterService</param>
/// <param name="corpus">List of CorpusDataRow</param>
/// <param name="outputPath">File path to output data file.</param>
/// <returns>List of CorpusDataRow (with Tweet DTO populated).</returns>
private static List<CorpusDataRow> LoadTweets(TwitterService service, List<CorpusDataRow> corpus, string outputPath)
{
    List<CorpusDataRow> outputCorpus = new List<CorpusDataRow>();
    int skipCount = 0;

    for (int index = GetResumeIndex(corpus, outputPath); index < corpus.Count; index++)
    {
        CorpusDataRow row = corpus[index];

        // Fetch the tweet.
        var status = service.GetTweet(new GetTweetOptions() { Id = row.Id });

        if (service.Response.StatusCode == System.Net.HttpStatusCode.OK && status != null)
        {
            // Clean the text only after the request is known to have succeeded; touching
            // status before this check would throw whenever a failed request yields no status.
            if (status.Text != null)
            {
                status.Text = status.Text
                    .Replace(",", " ")
                    .Replace("\n", " ")
                    .Replace("\r", " ")
                    .Replace("\t", " ");
            }

            // Convert the TwitterStatus to a Tweet DTO.
            row.Tweet = Mapper.Map<TwitterStatus, Tweet>(status);

            // Save the result to file.
            SaveResult(row, outputPath);
            outputCorpus.Add(row);

            if ((index + 1) % 50 == 0)
            {
                Console.WriteLine("Processed " + (index + 1) + " tweets.");
            }
        }
        else
        {
            // Check the rate limit.
            TwitterRateLimitStatus rateSearch = service.Response.RateLimitStatus;

            if (rateSearch != null && rateSearch.RemainingHits < 1)
            {
                DateTime resetTime = rateSearch.ResetTime + TimeSpan.FromMinutes(1);
                Console.WriteLine("Rate Limit reached. Sleeping until " + resetTime);

                // Thread.Sleep rejects negative durations; if the reset time has already
                // passed, retry immediately.
                TimeSpan wait = resetTime - DateTime.Now;
                if (wait > TimeSpan.Zero)
                {
                    Thread.Sleep(wait);
                }

                // Try this record again.
                index--;
            }
            else
            {
                // Some other error. Maybe 404. Skip this record.
                skipCount++;
                Console.WriteLine("Skipped " + skipCount + " records. Got " + service.Response.StatusCode + ".");
            }
        }
    }

    Console.WriteLine("Saved " + outputCorpus.Count + ", Skipped " + skipCount + ".");

    return outputCorpus;
}