// Used to skip accounts already processed and resume processing after a given account.
private bool skipAll;

/// <summary>
/// Initializes the <c>TwuserData</c>, <c>TweetData</c> and <c>TwuserLinkData</c> helpers
/// and resets <c>seedCounter</c> to 0.
/// </summary>
public BoundedCrawler()
{
    twuserData = new d.TwuserData();
    tweetData = new d.TweetData();
    twuserLinkData = new d.TwuserLinkData();
    seedCounter = 0;

    // skipAll is not assigned here, so it keeps its default value (false).
    // The original source carried the assignment below in disabled form.
    // NOTE(review): presumably enabled by hand when resuming an interrupted crawl - confirm.
    // skipAll = true;
}
/// <summary>
/// Creates a crawler whose account queue initially holds only the given seed account.
/// </summary>
/// <param name="seedAccountId">Id of the account placed first in the queue.</param>
public Crawler(string seedAccountId)
{
    // Seed the work queue with the single starting account.
    accounts = new Queue<string>(new string[] { seedAccountId });

    // Kept between the queue setup and the data helpers, exactly as in the original ordering.
    loadCookies();

    twuserData = new d.TwuserData();
    tweetData = new d.TweetData();
    tlinkData = new d.TwuserLinkData();

    mayEnqueue = true;

    // NOTE(review): the meaning of 1000 is not visible from this constructor - confirm against its usage.
    skipNewAccounts = 1000;
}
/// <summary>
/// Scans the rows returned by <c>TweetData.GetHashTagTweeets()</c> for hashtags and, for every
/// hashtag occurrence found, creates a tweet-to-hashtag link and a user-to-hashtag link.
/// </summary>
/// <remarks>
/// Rows are read positionally: column 0 is converted to the integer tweet id, column 1 is used as
/// the user id and column 3 is the text that is scanned. Each distinct normalized tag is passed to
/// <c>HahstagData.Create</c> once per call of this method; the returned id is cached in memory and
/// reused for later occurrences. Progress is written to the console as "tweet# N".
/// </remarks>
public void ExtractHashTags()
{
    d.TweetData tData = new d.TweetData();
    d.HahstagData hData = new d.HahstagData();
    d.TweetHashtagLinkData tweetHashtagData = new d.TweetHashtagLinkData();
    d.TwuserHashtagLinkData twuserHashtagData = new d.TwuserHashtagLinkData();

    // Normalized tag -> id returned by hData.Create, so each tag is created only once per run.
    Hashtable hashtags = new Hashtable();

    // A hashtag is '#' followed by one or more non-whitespace characters.
    // Built once here instead of being resolved again for every row.
    Regex hashtagRegex = new Regex(@"(?<tag>#[^\s]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    DataTable dt = tData.GetHashTagTweeets();
    int counter = 0;

    foreach (DataRow dr in dt.Rows)
    {
        Console.WriteLine("tweet# " + (++counter));

        int tweetId = Convert.ToInt32(dr[0]);
        string twuserId = dr[1].ToString();
        string nohtmltext = dr[3].ToString();

        for (Match match = hashtagRegex.Match(nohtmltext); match.Success; match = match.NextMatch())
        {
            string tag = normalize(match.Groups["tag"].Value.Trim());

            // Skip tags that normalize to nothing. A null result is skipped as well,
            // instead of failing later as a null key in the cache lookup.
            if (string.IsNullOrEmpty(tag))
            {
                continue;
            }

            // Single cache probe: the indexer yields null for a tag that has not been seen yet.
            int tagId;
            object cachedId = hashtags[tag];
            if (cachedId == null)
            {
                tagId = hData.Create(tag);
                hashtags.Add(tag, tagId);
            }
            else
            {
                tagId = (int)cachedId;
            }

            tweetHashtagData.Create(tweetId, tagId);
            twuserHashtagData.Create(twuserId, tagId);
        }
    }
}
/// <summary>
/// Removes markup tags from the text of every row returned by <c>TweetData.GetRecords()</c> and
/// passes the stripped text, together with the row id, to <c>TweetData.UpdateData</c>.
/// </summary>
/// <remarks>
/// Rows are read positionally: column 0 is converted to the integer id and column 2 is the text to
/// strip. Only <c>&lt;...&gt;</c> sequences are removed; everything else in the text is left as is.
/// A running row count is written to the console after each update.
/// </remarks>
public void ExtractText()
{
    d.TweetData tData = new d.TweetData();
    DataTable dt = tData.GetRecords();

    // Matches everything from '<' up to and including the next '>'.
    Regex tagRegex = new Regex("<[^>]*>");

    int counter = 0;
    foreach (DataRow dr in dt.Rows)
    {
        int id = Convert.ToInt32(dr[0]);
        string text = dr[2].ToString();
        string nohtmltext = tagRegex.Replace(text, "");

        tData.UpdateData(id, nohtmltext);
        Console.WriteLine(++counter);
    }
}