Ejemplo n.º 1
0
        /// <summary>
        /// Crawls the timelines of the accounts listed in a CSV file. For every account that
        /// has at least one non-empty tweet with media, the media files are downloaded under
        /// <c>images/&lt;user id&gt;/&lt;tweet id&gt;/</c>, the user statistics are appended to
        /// <c>user_info.txt</c>, and the cleaned tweet texts are appended to
        /// <c>corpus/&lt;user id&gt;.txt</c> as <c>text,tweet id,retweet count</c> lines.
        /// The last processed account id is persisted in the application settings after each
        /// account so that a later run resumes after it.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the path of the CSV file; the first column of every line is
        /// parsed as an unsigned 64-bit account id.
        /// </param>
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.Error.WriteLine("Usage: pass the path of the account CSV file as the first argument.");
                Environment.ExitCode = 1;
                return;
            }

            // Both the reader and the web client hold OS resources; dispose them on every exit path.
            using (StreamReader sr = new StreamReader(args[0]))
            using (System.Net.WebClient wc = new System.Net.WebClient())
            {
                bool isCrawTrainData = true;
                string imagePath = "images";
                string corpusPath = "corpus";

                // A non-zero saved id means an earlier run stopped after that account:
                // skip forward until it has been seen again.
                ulong savedId = Properties.Settings.Default.Last_ID;
                bool isLoad = savedId != 0;

                var tokens = TwitterAPI.getTokens();
                System.Text.RegularExpressions.Regex r =
                    new System.Text.RegularExpressions.Regex(@"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+");

                if (isCrawTrainData)
                {
                    // CreateDirectory is a no-op when the directory already exists.
                    Directory.CreateDirectory(imagePath);   // downloaded images
                    Directory.CreateDirectory(corpusPath);  // tweet texts
                }

                // Main loop: one account per CSV line.
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string id_str = line.Split(',')[0];
                    ulong accountId;
                    if (!ulong.TryParse(id_str, out accountId))
                    {
                        // Blank, header or malformed line: skip it instead of aborting the crawl.
                        Console.WriteLine("skipped line without a numeric account id: " + line);
                        continue;
                    }

                    // Fast-forward to the saved position; the saved account itself was
                    // already processed, so it is skipped as well.
                    if (isLoad && savedId != accountId)
                    {
                        continue;
                    }
                    if (savedId == accountId)
                    {
                        isLoad = false;
                        continue;
                    }

                    Console.WriteLine(id_str+":");

                    // Fetch the user object, waiting one minute and retrying while the API
                    // reports a transient condition. Any other failure leaves user == null.
                    UserResponse user = null;
                    while (true)
                    {
                        try
                        {
                            user = tokens[nowTokenIndex].Users.Show(id => accountId);
                            break;
                        }
                        catch (Exception ex)
                        {
                            // NOTE(review): this log line and the first comparison were reconstructed
                            // from a corrupted source line - confirm against the original.
                            Console.WriteLine("user:" + ex.Message);
                            if (ex.Message != "Rate limit exceeded" && ex.Message != "Over capacity")
                            {
                                break;
                            }
                            System.Threading.Thread.Sleep(new TimeSpan(0, 1, 0));
                        }
                    }

                    if (user == null)
                    {
                        // Without a user object nothing below can succeed; skip the timeline
                        // request and only record progress.
                        Console.WriteLine("user lookup failed, skipping account " + id_str);
                    }
                    else
                    {
                        try
                        {
                            uint counter = Properties.Settings.Default.counter;
                            List<int> RTList = new List<int>(), FavList = new List<int>();
                            List<string> corpusLines = new List<string>();
                            bool savedMedia = false;

                            // User info
                            userInfo uf = new userInfo(user);

                            // Fetch the tweets
                            var lines = CoreTweetExtend.GetUserTimeLines(tokens, accountId, 3000, new TimeSpan(0, 1, 0), (n) => Console.WriteLine(n));

                            // Per-user image directory
                            string myImagePath = Path.Combine(imagePath, user.Id.Value.ToString());
                            if (isCrawTrainData)
                            {
                                Directory.CreateDirectory(myImagePath);
                            }

                            foreach (var tweet in lines)
                            {
                                // Retweet/favorite counts of every tweet feed the statistics.
                                RTList.Add(tweet.RetweetCount.Value);
                                FavList.Add(tweet.FavoriteCount.Value);

                                // When collecting training data only tweets with media are kept.
                                if (tweet.Entities.Media == null && isCrawTrainData)
                                {
                                    continue;
                                }

                                // Clean the text: strip URLs and line breaks.
                                string text = r.Replace(tweet.Text, "");
                                text = text.Replace("\r", "");
                                text = text.Replace("\n", "");
                                if (isCrawTrainData)
                                {
                                    // The corpus file is comma separated, so ASCII commas and
                                    // periods are replaced with their full-width counterparts.
                                    text = text.Replace(",", "、");
                                    text = text.Replace(".", "。");
                                }

                                if (text == "")
                                {
                                    continue;
                                }

                                // Corpus line format: text,tweet id,retweet count
                                corpusLines.Add(string.Format("{0},{1},{2}", text, tweet.Id, tweet.RetweetCount.Value));

                                // Save the media files
                                if (isCrawTrainData)
                                {
                                    savedMedia = true;
                                    string tweetPath = Path.Combine(myImagePath, tweet.Id.ToString());
                                    Directory.CreateDirectory(tweetPath);
                                    foreach (var media in tweet.Entities.Media)
                                    {
                                        string thisPath = Path.Combine(tweetPath, Path.GetFileName(media.MediaUrl));
                                        wc.DownloadFile(media.MediaUrl, thisPath);

                                        Console.WriteLine(thisPath+":"+counter);
                                    }
                                }
                            }

                            // Results are written only when at least one media tweet was kept.
                            if (savedMedia)
                            {
                                // Mean and population standard deviation of retweets and favorites
                                uf.RTMean = RTList.Average();
                                uf.RTDev = Math.Sqrt(RTList.Select(t => Math.Pow(t - uf.RTMean, 2.0)).Sum() / RTList.Count);
                                uf.FavMean = FavList.Average();
                                uf.FavDev = Math.Sqrt(FavList.Select(t => Math.Pow(t - uf.FavMean, 2.0)).Sum() / FavList.Count);

                                // Append the user info
                                if (isCrawTrainData)
                                {
                                    using (StreamWriter sw_ui = new StreamWriter("user_info.txt", true, Encoding.UTF8))
                                    {
                                        sw_ui.WriteLine(uf.ToString());
                                    }
                                }

                                // Append the corpus
                                string mycpPath = Path.Combine(corpusPath, user.Id + ".txt");
                                using (StreamWriter cw_cp = new StreamWriter(mycpPath, true, Encoding.UTF8))
                                {
                                    foreach (string cp in corpusLines)
                                    {
                                        cw_cp.WriteLine(cp);
                                    }
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            // Best effort: a failure on one account must not stop the crawl.
                            Console.WriteLine(ex.Message);
                        }
                    }

                    // Persist progress after every account so an interrupted run can resume.
                    Properties.Settings.Default.Last_ID = accountId;
                    Console.WriteLine(Properties.Settings.Default.counter);
                    Properties.Settings.Default.counter++;
                    Properties.Settings.Default.Save();
                }
            }
        }