// Patterns locating the page token ("-p6B...") and the cluster token ("ggE...")
// in a search response. Both use a greedy ".+" and therefore also capture text
// that follows the token; getPageAndClusterTokens trims that surplus afterwards.
// They are cached in static fields so the compiled regexes are built once
// instead of on every call.
private static readonly Regex s_pagTokenRegex = new Regex(@"-p6B+.+\:S\:.{11}", RegexOptions.Compiled);
private static readonly Regex s_clpTokenRegex = new Regex(@"ggE+.+\:S\:.{11}", RegexOptions.Compiled);

/// <summary>
/// Get Page Token and Cluster Token for play store streaming search result.
/// </summary>
/// <param name="response">Response body</param>
/// <returns>
/// Page Token and Cluster Token, or <c>null</c> when the response is null/empty
/// or when either of the two tokens cannot be found in it.
/// </returns>
protected static ClusterAndToken getPageAndClusterTokens(string response)
{
    // Nothing to search in; report "no tokens" instead of letting Regex.Match throw on null.
    if (string.IsNullOrEmpty(response))
    {
        return null;
    }

    Match pagTokenMatch = s_pagTokenRegex.Match(response);
    Match clpTokenMatch = s_clpTokenRegex.Match(response);

    // Both tokens are required to build the next request.
    if (!pagTokenMatch.Success || !clpTokenMatch.Success)
    {
        return null;
    }

    // Page token: turn the escaped "u003d" sequence into '=' and keep only the
    // part before the first literal "\x22" marker, dropping whatever the greedy
    // match captured after it.
    string dirtyPagToken = pagTokenMatch.Value.Replace("\\\\u003d", "=");
    string cleanPagToken = dirtyPagToken.Split(new string[] { "\\x22" }, StringSplitOptions.None)[0];

    // Cluster token: keep only the part before the first '">' sequence.
    string cleanClusterToken = clpTokenMatch.Value.Split(new string[] { "\">" }, StringSplitOptions.None)[0];

    return new ClusterAndToken
    {
        pagTok = cleanPagToken,
        clp = cleanClusterToken
    };
}
/// <summary>
/// Crawls the search results for a keyword and gathers the apps found in them.
/// The first request is sent to the initial keyword URL; every following
/// request is built from the page and cluster tokens extracted from the
/// previous response. Crawling ends when no tokens are found or when the
/// number of failed requests exceeds <c>Consts.MAX_REQUEST_ERRORS</c>.
/// </summary>
/// <param name="keyword">Search term to crawl.</param>
/// <returns>
/// The parsed apps; an app is added only if the list does not already contain it.
/// </returns>
public static List<AppShortDescription> CollectAppsShortInformationFromKeywords(string keyword)
{
    log.Info("Crawling Search Term : [ " + keyword + " ]");

    List<AppShortDescription> collectedApps = new List<AppShortDescription>();

    // NOTE(review): "Russia" / "ru" presumably fill country and language slots
    // of the URL template - confirm against Consts.CRAWL_URL_KEYWORD_INITIAL.
    string requestUrl = String.Format(Consts.CRAWL_URL_KEYWORD_INITIAL, keyword, "Russia", "ru");
    string requestBody = Consts.POST_DATA_KEYWORD_INITAL;

    using (WebRequests server = new WebRequests())
    {
        server.Host = Consts.HOST;

        int insertedAppCount = 0;
        int skippedAppCount = 0;
        int errorsCount = 0;

        while (true)
        {
            string responseBody = server.Post(requestUrl, requestBody);

            // Failed request: count it and retry the same request unless the error budget is used up.
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                log.Error("Http Error - Status Code: " + server.StatusCode);
                errorsCount++;

                if (errorsCount <= Consts.MAX_REQUEST_ERRORS)
                {
                    continue;
                }

                log.Info("Crawl Stopped: MAX_REQUEST_ERRORS reached");
                break;
            }

            // Collect every app of this page that has not been seen yet.
            foreach (AppShortDescription app in parser.ParseAppUrls(responseBody))
            {
                if (collectedApps.Contains(app))
                {
                    ++skippedAppCount;
                    log.Info("Duplicated App. Skipped: " + app);
                    continue;
                }

                collectedApps.Add(app);
                log.Info("Inserted App: " + app);
                ++insertedAppCount;
            }

            // The tokens drive the request for the next chunk of the stream;
            // no tokens means the end of the stream has been reached.
            ClusterAndToken nextTokens = getPageAndClusterTokens(responseBody);
            if (nextTokens == null)
            {
                break;
            }

            requestUrl = Consts.CRAWL_URL_KEYWORD_CLUSTER;
            requestBody = String.Format(Consts.POST_DATA_KEYWORD_CLUSTER, nextTokens.clp, nextTokens.pagTok);

            Console.WriteLine("Inserted apps: " + insertedAppCount + ".");
        }

        log.Info("Inserted App Count: " + insertedAppCount);
        log.Info("Skipped App Count: " + skippedAppCount);
        log.Info("Error Count: " + errorsCount + "\n");
    }

    return collectedApps;
}