コード例 #1
0
ファイル: Crawler.cs プロジェクト: ic4f/oldcode
        /// <summary>
        /// Drains the <c>accounts</c> queue: for every dequeued account that
        /// <c>twuserData</c> does not already know, downloads its Twitter page and
        /// passes the page source to the user, following and tweet processors.
        /// </summary>
        public void Run()
        {
            int processed = 0;

            while (accounts.Count > 0)
            {
                // Pause two seconds at the start of every iteration, including
                // iterations that end up skipping an already-known account.
                Thread.Sleep(2000);

                string account = accounts.Dequeue();

                // Already stored: nothing to download for this account.
                if (twuserData.Exists(account))
                {
                    continue;
                }

                string pageSource = getPageSource("http://twitter.com/" + account, false);

                if (string.IsNullOrEmpty(pageSource))
                {
                    Console.WriteLine("ERROR: empty page for " + account);
                    continue;
                }

                // The counter is printed before it is incremented, so numbering starts at 0.
                Console.WriteLine("Processing account #" + processed + ": " + account);
                processed++;
                Console.WriteLine("Queue size = " + accounts.Count);

                processTwuser(account, pageSource);
                processFollowing(account, pageSource);
                processTweets(account, pageSource);
            }
        }
コード例 #2
0
        /// <summary>
        /// Re-crawls every seed account returned by <c>twuserData.GetSeedIds()</c> and,
        /// for each seed whose page could be downloaded, every followee returned by
        /// <c>twuserData.GetFollowees</c>. Each account name is fetched at most once per
        /// call; a seed is marked via <c>twuserData.MarkFolloweesAdded</c> once its
        /// followee list has been walked.
        /// </summary>
        /// <remarks>
        /// An account whose page source comes back null or empty is reported on the
        /// console and skipped. This applies to seeds and followees alike.
        /// </remarks>
        public void Run()
        {
            // Account names already attempted during this run (seeds and followees share the set).
            HashSet<string> visited = new HashSet<string>();

            DataTable seeds = twuserData.GetSeedIds();

            foreach (DataRow seedRow in seeds.Rows)
            {
                string seedAccount = seedRow[0].ToString();

                // HashSet.Add returns false when the name is already present,
                // so a single call both tests and records the visit.
                if (!visited.Add(seedAccount))
                {
                    continue;
                }

                string seedPageSource = getPageSource("http://twitter.com/" + seedAccount);

                if (string.IsNullOrEmpty(seedPageSource))
                {
                    Console.WriteLine("ERROR: empty page for " + seedAccount);
                    continue;
                }

                // seedCounter is declared outside this method; it is incremented only
                // for seeds whose page was actually retrieved.
                seedCounter++;
                Console.WriteLine("Processing account " + (seedCounter) + " of " + seeds.Rows.Count + ": " + seedAccount);

                // Must be done again: the seed account's tweets must be from the
                // same time as the followees' tweets.
                processTweets(seedAccount, seedPageSource);

                // Only existing accounts are used for this loop - i.e., they have already
                // been processed, so followees are available for them.
                DataTable followees = twuserData.GetFollowees(seedAccount);

                foreach (DataRow followeeRow in followees.Rows)
                {
                    string followeeAccount = followeeRow[0].ToString();

                    if (!visited.Add(followeeAccount))
                    {
                        continue;
                    }

                    string followeePageSource = getPageSource("http://twitter.com/" + followeeAccount);

                    if (string.IsNullOrEmpty(followeePageSource))
                    {
                        // Report the failure instead of dropping the followee silently,
                        // matching how an empty seed page is handled.
                        Console.WriteLine("ERROR: empty page for " + followeeAccount);
                        continue;
                    }

                    // Create the user record only when it is not stored yet.
                    if (!twuserData.Exists(followeeAccount))
                    {
                        processTwuser(followeeAccount, followeePageSource);
                    }

                    processFollowing(followeeAccount, followeePageSource);

                    // Again, the same tweets are added for repeat users!
                    processTweets(followeeAccount, followeePageSource);
                }

                twuserData.MarkFolloweesAdded(seedAccount);
            }
        }