Пример #1
0
        // Used to skip accounts that were already processed and to resume
        // processing after a given account.
        private bool skipAll;

        /// <summary>
        /// Creates the crawler: instantiates the <c>TwuserData</c>, <c>TweetData</c> and
        /// <c>TwuserLinkData</c> objects it works with and sets the seed counter to zero.
        /// </summary>
        public BoundedCrawler()
        {
            // Data-access objects, created in a fixed order.
            twuserData = new d.TwuserData();
            tweetData = new d.TweetData();
            twuserLinkData = new d.TwuserLinkData();

            seedCounter = 0;

            // skipAll is intentionally not assigned here; the "skipAll = true"
            // assignment is disabled.
        }
Пример #2
0
 /// <summary>
 /// Creates a crawler whose account queue initially contains only the given seed account.
 /// </summary>
 /// <param name="seedAccountId">Id of the account that is enqueued first.</param>
 public Crawler(string seedAccountId)
 {
     // The queue starts out holding just the seed account.
     accounts = new Queue<string>();
     accounts.Enqueue(seedAccountId);

     loadCookies();

     // Data-access objects.
     twuserData = new d.TwuserData();
     tweetData = new d.TweetData();
     tlinkData = new d.TwuserLinkData();

     // Initial crawl settings.
     mayEnqueue = true;
     skipNewAccounts = 1000;
 }
Пример #3
0
        /// <summary>
        /// Scans the rows returned by <c>TweetData.GetHashTagTweeets()</c> for hashtags and,
        /// for every tag that is non-empty after normalization, records a tweet-to-hashtag
        /// link and a user-to-hashtag link.
        /// </summary>
        /// <remarks>
        /// A tag is created through <c>HahstagData.Create</c> the first time it is seen during
        /// this call; the returned id is cached in memory and reused for later occurrences.
        /// A progress line is written to the console for every row.
        /// </remarks>
        public void ExtractHashTags()
        {
            // Normalized tag -> id returned by HahstagData.Create.
            Hashtable knownTagIds = new Hashtable();

            d.TweetData tweets = new d.TweetData();
            d.HahstagData hashtagStore = new d.HahstagData();
            d.TweetHashtagLinkData tweetLinks = new d.TweetHashtagLinkData();
            d.TwuserHashtagLinkData userLinks = new d.TwuserHashtagLinkData();
            DataTable rows = tweets.GetHashTagTweeets();

            // A '#' followed by one or more non-whitespace characters.
            Regex hashtagRegex = new Regex(@"(?<tag>#[^\s]+)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            int processed = 0;

            foreach (DataRow row in rows.Rows)
            {
                processed++;
                Console.WriteLine("tweet# " + processed);

                // Column 0: tweet id, column 1: user id, column 3: text searched for tags.
                int tweetId = Convert.ToInt32(row[0]);
                string twuserId = row[1].ToString();
                string plainText = row[3].ToString();

                foreach (Match hit in hashtagRegex.Matches(plainText))
                {
                    string tag = normalize(hit.Groups["tag"].ToString().Trim());

                    // Tags that normalize to the empty string are ignored.
                    if (tag == "")
                    {
                        continue;
                    }

                    int tagId;
                    if (knownTagIds.Contains(tag))
                    {
                        tagId = Convert.ToInt32(knownTagIds[tag]);
                    }
                    else
                    {
                        tagId = hashtagStore.Create(tag);
                        knownTagIds.Add(tag, tagId);
                    }

                    tweetLinks.Create(tweetId, tagId);
                    userLinks.Create(twuserId, tagId);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// For every row returned by <c>TweetData.GetRecords()</c>, removes markup tags from
        /// the row's text and passes the result to <c>TweetData.UpdateData</c> together with
        /// the row's id.
        /// </summary>
        /// <remarks>
        /// A running row count is written to the console after each update.
        /// </remarks>
        public void ExtractText()
        {
            d.TweetData tweets = new d.TweetData();
            DataTable records = tweets.GetRecords();

            // Matches an opening angle bracket up to the next closing one.
            Regex tagPattern = new Regex("<[^>]*>");

            int processed = 0;

            foreach (DataRow row in records.Rows)
            {
                // Column 0: id, column 2: text that may contain markup.
                int tweetId = Convert.ToInt32(row[0]);
                string rawText = row[2].ToString();

                string strippedText = tagPattern.Replace(rawText, "");
                tweets.UpdateData(tweetId, strippedText);

                processed++;
                Console.WriteLine(processed);
            }
        }