Example #1
0
        /// <summary>
        /// Runs <c>TwitterSearch.Search</c> for <paramref name="query"/> and, for every returned tweet
        /// whose <c>TweetID</c> is not yet in <paramref name="db"/>, stores the tweet together with the
        /// websites its links expand to and the media elements scraped from those websites.
        /// </summary>
        /// <param name="db">Entity context that is queried for existing rows and receives the new ones.</param>
        /// <param name="query">Search terms passed through to <c>TwitterSearch.Search</c>.</param>
        private static void getSites(TweetDataEntities db, params string[] query)
        {
            // Built once: the pattern never changes, so there is no reason to construct (and
            // compile) a new Regex for every tweet.
            Regex linkParser = new Regex(@"\b(?:http://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // URLs handled during this call. The db.Websites / db.Media queries below only see rows
            // that have already been saved, and SaveChanges runs once per tweet, so without these
            // sets a link repeated inside one tweet (or an image repeated on one page) would be
            // queued twice.
            HashSet<string> seenUrls = new HashSet<string>();
            HashSet<string> seenMedia = new HashSet<string>();

            var twitter = new TwitterSearch();
            int count = 0;
            // NOTE(review): the meaning of the 100 and 10 arguments is defined by TwitterSearch.Search — confirm there.
            foreach (var tweet in twitter.Search(100, 10, query)) {
                Debug.Print("Tweet number: " + (++count).ToString());
                if (db.Tweets.Any(i => i.TweetID == tweet.TweetID)) {
                    continue;
                }
                Tweet t = new Tweet() { Text = tweet.Text, TweetID = tweet.TweetID };
                Debug.Print("Tweet: " + tweet.Text);
                foreach (Match m in linkParser.Matches(tweet.Text)) {
                    string fullUrl = "";
                    try {
                        fullUrl = m.Value.ExpandUrl();
                    } catch {
                        // Best effort: a link that cannot be expanded is skipped, but leave a trace.
                        Debug.Print("Failed to expand url: " + m.Value);
                        continue;
                    }
                    // Skip links already handled in this call or already stored.
                    if (!seenUrls.Add(fullUrl) || db.Websites.Any(i => i.Url == fullUrl)) {
                        continue;
                    }
                    Debug.Print("Website: " + fullUrl);
                    var page = new PageScraper(fullUrl);
                    var website = new Website() { Url = page.Url, Title = page.Title() };
                    db.Websites.AddObject(website);
                    foreach (var m2 in page.Media()) {
                        if (!seenMedia.Add(m2.Link) || db.Media.Any(i => i.Url == m2.Link)) {
                            continue;
                        }
                        Medium media = new Medium() { Type = m2.Type, Url = m2.Link, SourceSite = website.Url };
                        if (m2.Type == "image") {
                            // Download the image only to record its pixel area.
                            var request = WebRequest.Create(m2.Link);

                            using (var response = request.GetResponse())
                            using (var stream = response.GetResponseStream())
                            using (var b = Bitmap.FromStream(stream)) {
                                media.ImageArea = b.Width * b.Height;
                            }
                        }
                        db.Media.AddObject(media);
                        Debug.Print("Media element: " + m2.Link);
                    }
                    // Overwritten on each newly stored site, so the last one in the tweet wins.
                    t.LinkSite = website.Url;
                }
                db.Tweets.AddObject(t);
                // One save per tweet persists the tweet with its queued websites and media.
                db.SaveChanges();
            }
        }
Example #2
0
 /// <summary>
 /// Console entry point: runs <c>TwitterSearch.Search</c> for "data" and "visualization", writes
 /// each returned tweet to the console, and prints to the debug output every item that
 /// <c>PageScraper.GetImages</c> returns for the pages linked from the tweet text.
 /// </summary>
 /// <param name="args">Command-line arguments; not read by this method.</param>
 static void Main(string[] args)
 {
     // Built once: the pattern is constant, so constructing a compiled Regex for every tweet
     // only repeats the same work.
     Regex linkParser = new Regex(@"\b(?:http://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
     var twitter = new TwitterSearch();
     // NOTE(review): the meaning of the 100 argument is defined by TwitterSearch.Search — confirm there.
     foreach (var tweet in twitter.Search(100, "data","visualization")) {
         Console.WriteLine(tweet);
         foreach (Match m in linkParser.Matches(tweet.Text)) {
             // Expand the matched link before scraping it.
             var fullUrl = m.Value.ExpandUrl();
             var scraper = new PageScraper(fullUrl);
             foreach (var img in scraper.GetImages()) {
                 Debug.Print(img);
             }
         }
     }
 }
Example #3
0
 /// <summary>
 /// Lazily yields the items returned by <c>PageScraper.GetImages</c> for every distinct expanded
 /// link found in the tweets that <c>TwitterSearch.Search</c> returns for <paramref name="query"/>.
 /// </summary>
 /// <param name="query">Search terms passed through to <c>TwitterSearch.Search</c>.</param>
 /// <returns>
 /// A deferred sequence; the search and the page scraping run while the caller enumerates it.
 /// </returns>
 public IEnumerable<string> Fetch(params string[] query)
 {
     // Built once: the pattern is constant, so there is no need to construct a compiled Regex
     // for every tweet.
     Regex linkParser = new Regex(@"\b(?:http://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
     // Expanded URLs already scraped during this enumeration.
     HashSet<string> seenUrls = new HashSet<string>();
     var twitter = new TwitterSearch();
     // NOTE(review): the meaning of the 10 and 1 arguments is defined by TwitterSearch.Search — confirm there.
     foreach (var tweet in twitter.Search(10, 1, query)) {
         foreach (Match m in linkParser.Matches(tweet.Text)) {
             var fullUrl = m.Value.ExpandUrl();
             Debug.Print(fullUrl);
             // Add returns false for a URL that is already in the set, so a single call
             // replaces the separate Contains + Add lookups.
             if (!seenUrls.Add(fullUrl)) {
                 continue;
             }
             var scraper = new PageScraper(fullUrl);
             foreach (var img in scraper.GetImages()) {
                 yield return img;
             }
         }
     }
 }
Example #4
0
        /// <summary>
        /// Stores a tweet that is not yet in <paramref name="db"/> and then, for each link found in
        /// its text, either increments the hit count of the matching stored website or scrapes the
        /// page and stores it together with its media elements.
        /// </summary>
        /// <param name="db">Entity context that is queried and updated.</param>
        /// <param name="date">Value stored as the tweet's <c>Date</c>.</param>
        /// <param name="handle">Text whose first space-separated token is stored as <c>Username</c>.</param>
        /// <param name="tweet">Tweet text; stored as <c>TweetText</c> and scanned for links.</param>
        /// <param name="url">Tweet URL; the part after its last '/' is used as the <c>TweetID</c>.</param>
        private static void addTweet(VisualizationEntities db, DateTime date, string handle, string tweet, string url)
        {
            counter++;
            Debug.Print("Counter: " + counter);
            Debug.Print("Tweet: " + tweet);

            string id = url.Split('/').Last();
            // Nothing to do for a tweet that is already stored; check before building any entity.
            if (db.Tweets.Any(i => i.TweetID == id)) {
                return;
            }
            DB2.Tweet newTweet = new DB2.Tweet() { TweetText = tweet, Date = date, Username = handle.Split(' ').First(), TweetID = id };
            db.Tweets.AddObject(newTweet);
            db.SaveChanges();

            Regex linkParser = new Regex(@"\b(?:http://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            foreach (Match m in linkParser.Matches(tweet)) {
                string fullUrl = "";
                string uniqueUrl = "";
                try {
                    fullUrl = m.Value.ExpandUrl();
                    uniqueUrl = fullUrl.UnuiqeUrl();
                } catch {
                    // Best effort: a link that cannot be expanded is skipped, but leave a trace.
                    Debug.Print("Failed to expand url: " + m.Value);
                    continue;
                }
                // Skip URLs longer than 300 characters and URLs that are already stored verbatim.
                if (fullUrl.Length > 300 || db.Websites.Any(i => i.Url == fullUrl)) {
                    continue;
                }
                Debug.Print("Website: " + fullUrl);
                var seen = db.Websites.Where(i => i.Url == uniqueUrl).SingleOrDefault();
                if (seen != null) {
                    // Known under its unique form: only count the additional hit.
                    seen.HitCount++;
                    db.SaveChanges();
                    continue;
                }

                var page = new PageScraper(fullUrl);
                var website = new DB2.Website() { Url = page.Url, Title = page.Title() };
                db.Websites.AddObject(website);
                // Saved before the media loop, which reads website.ID.
                db.SaveChanges();
                foreach (var m2 in page.Media()) {
                    if (m2.Link.Length > 300 || db.Media.Any(i => i.Url == m2.Link)) {
                        continue;
                    }
                    try {
                        DB2.Medium media = new DB2.Medium() { Type = m2.Type, Url = m2.Link, SourceSiteID = website.ID };
                        if (m2.Type == "image") {
                            // Download the image only to record its dimensions.
                            var request = WebRequest.Create(m2.Link);

                            using (var response = request.GetResponse())
                            using (var stream = response.GetResponseStream())
                            using (var b = System.Drawing.Bitmap.FromStream(stream)) {
                                media.Width = b.Width;
                                media.Height = b.Height;
                            }
                        }
                        db.Media.AddObject(media);
                        db.SaveChanges();
                        Debug.Print("Media element: " + m2.Link);
                    } catch {
                        // Best effort: one broken media element must not stop the rest, but the
                        // failure is no longer silent.
                        // NOTE(review): if SaveChanges itself failed, the entity presumably stays
                        // queued in the context and later saves may fail again — confirm.
                        Debug.Print("Failed to store media element: " + m2.Link);
                    }
                }
            }
            db.SaveChanges();
        }
Example #5
0
        /// <summary>
        /// Stores a tweet that is not yet in <paramref name="db"/>, links it to its hashtags, and
        /// links it to the websites its URLs resolve to; websites not stored yet are scraped and
        /// saved together with their media elements.
        /// </summary>
        /// <remarks>
        /// Each call increments the static <c>counter</c>. The call returns immediately while the
        /// counter's value before the increment is below <c>TweetIndex - 1</c> from the application
        /// settings, and <c>TweetIndex</c> is rewritten after every stored tweet (presumably so an
        /// interrupted import can resume where it stopped — verify against the caller).
        /// </remarks>
        /// <param name="db">Entity context that is queried and updated.</param>
        /// <param name="date">Value stored as the tweet's <c>Date</c>.</param>
        /// <param name="handle">Text whose first space-separated token is stored as <c>Username</c>.</param>
        /// <param name="tweet">Tweet text; stored as <c>TweetText</c> and parsed by <c>TweetData</c>.</param>
        /// <param name="url">Tweet URL; the part after its last '/' is used as the <c>TweetID</c>.</param>
        private static void addTweet(VisualizationEntities db, DateTime date, string handle, string tweet, string url)
        {
            if (counter++ < Properties.Settings.Default.TweetIndex - 1) {
                return;
            }
            Debug.Print("Tweet counter: " + counter);
            Debug.Print("Tweet: " + tweet);

            string tweetId = url.Split('/').Last();
            // Nothing to do for a tweet that is already stored; check before building any entity.
            if (db.Tweets.Any(i => i.TweetID == tweetId)) {
                return;
            }
            TweetData tweetData = new TweetData(tweet);
            Tweet newTweet = new DB2.Tweet() { TweetText = tweet, Date = date, Username = handle.Split(' ').First(), TweetID = tweetId };
            db.Tweets.AddObject(newTweet);
            db.SaveChanges();
            Properties.Settings.Default.TweetIndex = counter;
            Properties.Settings.Default.Save();

            foreach (var hashtag in tweetData.Hashtags()) {
                // One query instead of Contains followed by Single. SingleOrDefault still throws
                // when the same tag is stored more than once, exactly as Single did.
                Hashtag extantTag = db.Hashtags.Where(i => i.Tag == hashtag).SingleOrDefault();
                if (extantTag == null) {
                    Debug.Print("new tag: " + hashtag);
                    extantTag = new Hashtag() { Tag = hashtag };
                    db.Hashtags.AddObject(extantTag);
                    // Saved first because the link row below reads extantTag.ID.
                    db.SaveChanges();
                }

                db.TweetHashtags.AddObject(new TweetHashtag() { TweetID = newTweet.ID, HashtagID = extantTag.ID });
                db.SaveChanges();
            }

            foreach (string matchedUrl in tweetData.Urls()) {
                string uniqueUrl = "";
                try {
                    uniqueUrl = matchedUrl.ExpandUrl().UnuiqeUrl();
                } catch {
                    // Best effort: a link that cannot be expanded is skipped, but leave a trace.
                    Debug.Print("Failed to expand url: " + matchedUrl);
                    continue;
                }

                // Known website: only link it to the tweet. A single FirstOrDefault replaces the
                // Count() > 0 / First() pair, which queried twice.
                var knownSite = db.Websites.Where(i => i.Url == uniqueUrl).FirstOrDefault();
                if (knownSite != null) {
                    var knownLink = new TweetWebsite() { TweetID = newTweet.ID, WebsiteID = knownSite.ID };
                    db.TweetWebsites.AddObject(knownLink);
                    db.SaveChanges();
                    continue;
                }

                // From here on the website is new (the lookup above found no match, so it is not
                // queried again). URLs longer than 600 characters are skipped.
                if (uniqueUrl.Length > 600) {
                    continue;
                }
                Debug.Print("Website: " + uniqueUrl);
                try {
                    var page = new PageScraper(uniqueUrl);
                    string longUrlTitle = "";
                    string description = "";

                    try {
                        longUrlTitle = page.Url.GetTitle();
                    } catch { Debug.Print("Failed to get title"); }
                    try {
                        description = page.Url.GetDescription();
                    } catch { Debug.Print("Failed to get description"); }
                    var website = new DB2.Website() {
                        Url = page.Url,
                        // Each text is cut to a fixed maximum length; null is treated as empty so
                        // that Take does not throw and discard the whole website.
                        Title = string.Concat((page.Title() ?? "").Take(300)),
                        TweetID = newTweet.ID,
                        LongUrlTitle = string.Concat((longUrlTitle ?? "").Take(100)),
                        Description = string.Concat((description ?? "").Take(500))
                    };
                    db.Websites.AddObject(website);
                    // Saved first because the rows below read website.ID.
                    db.SaveChanges();

                    var tweetWebsite = new TweetWebsite() { TweetID = newTweet.ID, WebsiteID = website.ID };
                    db.TweetWebsites.AddObject(tweetWebsite);
                    db.SaveChanges();
                    foreach (var m2 in page.Media()) {
                        if (m2.Link.Length > 600 || db.Media.Any(i => i.Url == m2.Link)) {
                            continue;
                        }
                        try {
                            DB2.Medium media = new DB2.Medium() { Type = m2.Type, Url = m2.Link, SourceSiteID = website.ID };
                            if (m2.Type == "image") {
                                // Download the image only to record its dimensions.
                                var request = WebRequest.Create(m2.Link);

                                using (var response = request.GetResponse())
                                using (var stream = response.GetResponseStream())
                                using (var b = System.Drawing.Bitmap.FromStream(stream)) {
                                    media.Width = b.Width;
                                    media.Height = b.Height;
                                }
                            }
                            db.Media.AddObject(media);
                            db.SaveChanges();
                            Debug.Print("Media element: " + m2.Link);
                        } catch {
                            // The first failing media element ends media collection for this
                            // website; record which one it was instead of stopping silently.
                            Debug.Print("Failed to store media element: " + m2.Link);
                            break;
                        }
                    }
                } catch {
                    Debug.Print("Failed to scrape page: " + uniqueUrl);
                }
            }
            db.SaveChanges();
        }