private static void getSites(TweetDataEntities db, params string[] query)
{
    var twitter = new TwitterSearch();

    // Build the link matcher once; compiling it inside the loop defeats
    // RegexOptions.Compiled. Also match https links, not just http.
    var linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);
    int count = 0;

    foreach (var tweet in twitter.Search(100, 10, query))
    {
        Debug.Print("Tweet number: " + (++count));

        // Skip tweets we have already stored.
        if (db.Tweets.Any(i => i.TweetID == tweet.TweetID))
        {
            continue;
        }

        var t = new Tweet { Text = tweet.Text, TweetID = tweet.TweetID };
        Debug.Print("Tweet: " + tweet.Text);

        foreach (Match m in linkParser.Matches(tweet.Text))
        {
            string fullUrl;
            try
            {
                fullUrl = m.Value.ExpandUrl();
            }
            catch
            {
                continue; // Unresolvable short link; move on to the next match.
            }

            if (db.Websites.Any(i => i.Url == fullUrl))
            {
                continue;
            }

            Debug.Print("Website: " + fullUrl);
            var page = new PageScraper(fullUrl);
            var website = new Website { Url = page.Url, Title = page.Title() };
            db.Websites.AddObject(website);

            foreach (var m2 in page.Media())
            {
                if (db.Media.Any(i => i.Url == m2.Link))
                {
                    continue;
                }

                var media = new Medium { Type = m2.Type, Url = m2.Link, SourceSite = website.Url };

                // For images, download the file to record its pixel area.
                if (m2.Type == "image")
                {
                    var request = WebRequest.Create(m2.Link);
                    using (var response = request.GetResponse())
                    using (var stream = response.GetResponseStream())
                    using (var b = Bitmap.FromStream(stream))
                    {
                        media.ImageArea = b.Width * b.Height;
                    }
                }

                db.Media.AddObject(media);
                Debug.Print("Media element: " + m2.Link);
            }

            t.LinkSite = website.Url;
        }

        db.Tweets.AddObject(t);
        db.SaveChanges();
    }
}
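// ExpandUrl, used throughout these methods, is a project extension that is
// not shown here. A minimal sketch of what it might look like, assuming it
// resolves shortened links by letting HttpWebRequest follow redirects; the
// real implementation may normalize or cache results differently.
public static class UrlExtensions
{
    public static string ExpandUrl(this string url)
    {
        // Bare "www." matches from the regex need a scheme before parsing.
        if (!url.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            url = "http://" + url;
        }

        var request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "HEAD";           // no need to download the body
        request.AllowAutoRedirect = true;  // follow the shortener's 301/302 chain

        using (var response = (HttpWebResponse)request.GetResponse())
        {
            return response.ResponseUri.AbsoluteUri;
        }
    }
}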
static void Main(string[] args)
{
    var twitter = new TwitterSearch();

    // Build the link matcher once, outside the loop; also match https links.
    var linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    foreach (var tweet in twitter.Search(100, "data", "visualization"))
    {
        Console.WriteLine(tweet);

        foreach (Match m in linkParser.Matches(tweet.Text))
        {
            var fullUrl = m.Value.ExpandUrl();
            var scraper = new PageScraper(fullUrl);
            foreach (var img in scraper.GetImages())
            {
                Debug.Print(img);
            }
        }
    }
}
public IEnumerable<string> Fetch(params string[] query)
{
    var seenUrls = new HashSet<string>();
    var twitter = new TwitterSearch();
    var linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    foreach (var tweet in twitter.Search(10, 1, query))
    {
        foreach (Match m in linkParser.Matches(tweet.Text))
        {
            var fullUrl = m.Value.ExpandUrl();
            Debug.Print(fullUrl);

            // Only scrape each resolved URL once per call; Add returns
            // false when the set already contains the URL.
            if (!seenUrls.Add(fullUrl))
            {
                continue;
            }

            var scraper = new PageScraper(fullUrl);
            foreach (var img in scraper.GetImages())
            {
                yield return img;
            }
        }
    }
}
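// Because Fetch uses yield return, nothing executes until the caller
// enumerates the sequence. A hypothetical usage sketch (the containing
// class name ImageFetcher is assumed here; Take requires System.Linq):
var fetcher = new ImageFetcher();
foreach (var imageUrl in fetcher.Fetch("data", "visualization").Take(5))
{
    // The Twitter search and page scraping run lazily as items are
    // pulled, and stop as soon as enumeration ends.
    Console.WriteLine(imageUrl);
}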
private static void addTweet(VisualizationEntities db, DateTime date, string handle, string tweet, string url)
{
    counter++;
    Debug.Print("Counter: " + counter);
    Debug.Print("Tweet: " + tweet);

    var linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // The tweet ID is the last path segment of the status URL.
    string id = url.Split('/').Last();
    var newTweet = new DB2.Tweet
    {
        TweetText = tweet,
        Date = date,
        Username = handle.Split(' ').First(),
        TweetID = id
    };

    if (db.Tweets.Any(i => i.TweetID == id))
    {
        return; // Already stored; nothing to do.
    }
    db.Tweets.AddObject(newTweet);
    db.SaveChanges();

    foreach (Match m in linkParser.Matches(tweet))
    {
        string fullUrl;
        string uniqueUrl;
        try
        {
            fullUrl = m.Value.ExpandUrl();
            uniqueUrl = fullUrl.UnuiqeUrl(); // project extension; misspelling is from its definition
        }
        catch
        {
            continue;
        }

        // Skip URLs too long for the column and sites we already have.
        if (fullUrl.Length > 300 || db.Websites.Any(i => i.Url == fullUrl))
        {
            continue;
        }

        Debug.Print("Website: " + fullUrl);

        // Note: the skip above compares the raw fullUrl, but the lookup here
        // uses the normalized uniqueUrl, so near-duplicates still bump HitCount.
        var seen = db.Websites.SingleOrDefault(i => i.Url == uniqueUrl);
        if (seen != null)
        {
            seen.HitCount++;
            db.SaveChanges();
            continue;
        }

        var page = new PageScraper(fullUrl);
        var website = new DB2.Website { Url = page.Url, Title = page.Title() };
        db.Websites.AddObject(website);
        db.SaveChanges();

        foreach (var m2 in page.Media())
        {
            if (m2.Link.Length > 300 || db.Media.Any(i => i.Url == m2.Link))
            {
                continue;
            }

            try
            {
                var media = new DB2.Medium { Type = m2.Type, Url = m2.Link, SourceSiteID = website.ID };

                // For images, download the file to record its dimensions.
                if (m2.Type == "image")
                {
                    var request = WebRequest.Create(m2.Link);
                    using (var response = request.GetResponse())
                    using (var stream = response.GetResponseStream())
                    using (var b = System.Drawing.Bitmap.FromStream(stream))
                    {
                        media.Width = b.Width;
                        media.Height = b.Height;
                    }
                }

                db.Media.AddObject(media);
                db.SaveChanges();
                Debug.Print("Media element: " + m2.Link);
            }
            catch
            {
                // Ignore media that fail to download or parse.
            }
        }
    }

    db.SaveChanges();
}
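// UnuiqeUrl (the misspelling comes from the project's own extension) is a
// URL normalizer used for deduplication; its rules are not shown. A rough
// sketch of the kind of canonicalization it might perform -- the real
// extension may keep the scheme or query string:
public static class UrlNormalization
{
    public static string UnuiqeUrl(this string url)
    {
        var uri = new Uri(url);

        // Lower-case the host and drop scheme, query, and fragment so that
        // near-identical links (http vs https, trailing slash) compare equal.
        string host = uri.Host.ToLowerInvariant();
        string path = uri.AbsolutePath.TrimEnd('/');
        return host + path;
    }
}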
private static void addTweet(VisualizationEntities db, DateTime date, string handle, string tweet, string url)
{
    // Resume from where the last run left off, using the saved index.
    if (counter++ < Properties.Settings.Default.TweetIndex - 1)
    {
        return;
    }

    Debug.Print("Tweet counter: " + counter);
    Debug.Print("Tweet: " + tweet);

    var tweetData = new TweetData(tweet);
    string tweetId = url.Split('/').Last();
    var newTweet = new DB2.Tweet
    {
        TweetText = tweet,
        Date = date,
        Username = handle.Split(' ').First(),
        TweetID = tweetId
    };

    if (db.Tweets.Any(i => i.TweetID == tweetId))
    {
        return; // Already stored; nothing to do.
    }
    db.Tweets.AddObject(newTweet);
    db.SaveChanges();
    Properties.Settings.Default.TweetIndex = counter;
    Properties.Settings.Default.Save();

    // Record hashtags, creating each tag once and linking it to the tweet.
    foreach (var hashtag in tweetData.Hashtags())
    {
        Hashtag extantTag;
        if (!db.Hashtags.Any(i => i.Tag == hashtag))
        {
            Debug.Print("new tag: " + hashtag);
            extantTag = new Hashtag { Tag = hashtag };
            db.Hashtags.AddObject(extantTag);
            db.SaveChanges();
        }
        else
        {
            extantTag = db.Hashtags.Single(i => i.Tag == hashtag);
        }

        db.TweetHashtags.AddObject(new TweetHashtag { TweetID = newTweet.ID, HashtagID = extantTag.ID });
        db.SaveChanges();
    }

    foreach (string matchedUrl in tweetData.Urls())
    {
        string uniqueUrl;
        try
        {
            uniqueUrl = matchedUrl.ExpandUrl().UnuiqeUrl();
        }
        catch
        {
            continue;
        }

        // If we already know this site, just link the tweet to it.
        var knownSite = db.Websites.FirstOrDefault(i => i.Url == uniqueUrl);
        if (knownSite != null)
        {
            db.TweetWebsites.AddObject(new TweetWebsite { TweetID = newTweet.ID, WebsiteID = knownSite.ID });
            db.SaveChanges();
            continue;
        }

        // This is a brand-new website; skip URLs too long for the column.
        if (uniqueUrl.Length > 600)
        {
            continue;
        }

        Debug.Print("Website: " + uniqueUrl);
        try
        {
            var page = new PageScraper(uniqueUrl);

            string longUrlTitle = "";
            string description = "";
            try { longUrlTitle = page.Url.GetTitle(); }
            catch { Debug.Print("Failed to get title"); }
            try { description = page.Url.GetDescription(); }
            catch { Debug.Print("Failed to get description"); }

            // Truncate scraped text to fit the column sizes.
            var website = new DB2.Website
            {
                Url = page.Url,
                Title = string.Concat((page.Title() ?? "").Take(300)),
                TweetID = newTweet.ID,
                LongUrlTitle = string.Concat(longUrlTitle.Take(100)),
                Description = string.Concat(description.Take(500))
            };
            db.Websites.AddObject(website);
            db.SaveChanges();

            db.TweetWebsites.AddObject(new TweetWebsite { TweetID = newTweet.ID, WebsiteID = website.ID });
            db.SaveChanges();

            foreach (var m2 in page.Media())
            {
                if (m2.Link.Length > 600 || db.Media.Any(i => i.Url == m2.Link))
                {
                    continue;
                }

                try
                {
                    var media = new DB2.Medium { Type = m2.Type, Url = m2.Link, SourceSiteID = website.ID };

                    // For images, download the file to record its dimensions.
                    if (m2.Type == "image")
                    {
                        var request = WebRequest.Create(m2.Link);
                        using (var response = request.GetResponse())
                        using (var stream = response.GetResponseStream())
                        using (var b = System.Drawing.Bitmap.FromStream(stream))
                        {
                            media.Width = b.Width;
                            media.Height = b.Height;
                        }
                    }

                    db.Media.AddObject(media);
                    db.SaveChanges();
                    Debug.Print("Media element: " + m2.Link);
                }
                catch
                {
                    break; // Give up on this page's media after the first failure.
                }
            }
        }
        catch
        {
            Debug.Print("Failed to scrape page: " + uniqueUrl);
        }
    }

    db.SaveChanges();
}
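// GetTitle and GetDescription are string extensions that fetch the page at
// a URL and read its <title> and meta description; they are not shown here.
// A minimal sketch assuming HtmlAgilityPack (the project may parse pages
// some other way):
using HtmlAgilityPack;

public static class PageMetadataExtensions
{
    public static string GetTitle(this string url)
    {
        var doc = new HtmlWeb().Load(url);
        var node = doc.DocumentNode.SelectSingleNode("//title");
        return node == null ? "" : node.InnerText.Trim();
    }

    public static string GetDescription(this string url)
    {
        var doc = new HtmlWeb().Load(url);
        var node = doc.DocumentNode.SelectSingleNode("//meta[@name='description']");
        return node == null ? "" : node.GetAttributeValue("content", "").Trim();
    }
}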