/// <summary>
/// Reads a FHIR Patient from the request body, looks it up through the
/// alternate (demographics) search and writes the result as plain text:
/// the matching local NoID, or "no match found" after recording the
/// alternate-search entry for later lookups.
/// </summary>
/// <param name="context">Current HTTP context; the request body carries the FHIR message.</param>
public void ProcessRequest(HttpContext context)
{
    context.Response.ContentType = "text/plain";
    try
    {
        Stream httpStream = context.Request.InputStream;
        StreamReader httpStreamReader = new StreamReader(httpStream);
        Resource newResource = FHIRUtilities.StreamToFHIR(httpStreamReader);
        _patient = (Patient)newResource;

        // find all patient without fingerprints. should be a small sample.
        // if found, return localid
        // if not found, return "no match found"
        MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
        AlternateSearch altSearch = GetAlternateFromPatient(_patient);
        string localNoID = dbwrapper.AlternateSearch(altSearch);

        // Ordinal, case-insensitive test: the scheme prefix is an identifier, not
        // linguistic text, so it must not depend on the server's current culture.
        // A null result is treated the same as any other non-matching value.
        bool matchFound = localNoID != null
            && localNoID.IndexOf("noid://", StringComparison.OrdinalIgnoreCase) >= 0;

        if (matchFound)
        {
            _responseText = localNoID;
        }
        else
        {
            dbwrapper.AddAlternateSearch(altSearch);
            _responseText = "no match found";
        }
    }
    catch (Exception ex)
    {
        _responseText = "Error in AltMatchByDemographics::ProcessRequest: " + ex.Message;
        LogUtilities.LogEvent(_responseText);
    }
    context.Response.Write(_responseText);
    context.Response.End();
}
/// <summary>
/// Interactive console test driver. Reads single-letter commands until "q":
/// c = send test patients, p = fetch the pending patient queue,
/// m = exercise the MongoDB wrapper, f = send a fingerprint media message.
/// An empty line repeats the previous command; end of input stops the loop.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    string commandLine = "";
    Console.WriteLine("Enter C for checkin patient, P for pending patient queue, M for Mongo tests, F for fingerprint identity and Q to quit");
    while (commandLine != "q")
    {
        if (commandLine == "c")
        {
            // call PatentCheckinUri
            Console.WriteLine("Sending test patient FHIR message.");
            Patient testPt = TestPatient();
            SendJSON(testPt);
            Console.WriteLine("Sending FHIR message from file.");
            Patient readPt = ReadPatient(@"C:\JSONTest\sample-new-patient.json");
            SendJSON(readPt);
        }
        else if (commandLine == "p") // send profiles
        {
            // call PendingPatientsUri; only the round trip is exercised, the result is not inspected
            GetCheckinList();
            Console.WriteLine("Patient profiles received.");
        }
        else if (commandLine == "m") // MongoDB tests
        {
            MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
            SessionQueue seq = new SessionQueue();
            seq._id = Guid.NewGuid().ToString();
            seq.ClinicArea = "Test Clinic";
            seq.LocalReference = "123456";
            seq.SparkReference = "spark5";
            seq.ApprovalStatus = "pending";
            seq.PatientStatus = "new";
            seq.RemoteHubReference = "rem440403";
            seq.SessionComputerName = "Prototype Computer 1";
            seq.SubmitDate = DateTime.UtcNow.AddMinutes(-15);
            seq.PatientBeginDate = DateTime.UtcNow.AddMinutes(-19);
            Console.WriteLine(seq.Serialize());
            dbwrapper.AddPendingPatient(seq);
            dbwrapper.GetPendingPatients();
            dbwrapper.UpdateSessionQueueRecord(seq._id, "approved", "TestUser", "TestComputer");
        }
        else if (commandLine == "f") // test fingerprint identity web service
        {
            Media readMedia = ReadMedia(@"C:\JSONTest\sample-media-fhir-message.json");
            SendJSON(readMedia);
        }

        string input = Console.ReadLine();
        if (input == null)
        {
            // ReadLine returns null when standard input is closed or redirected
            // input is exhausted; there is nothing more to read, so stop.
            break;
        }
        if (input.Length > 0)
        {
            // Only the first character matters; invariant casing keeps the
            // command letters stable regardless of the machine's culture.
            commandLine = input.Substring(0, 1).ToLowerInvariant();
        }
        // An empty line leaves commandLine untouched, repeating the previous command.
    }
}
/// <summary>
/// Prepares the fixture: creates the Mongo wrapper, resets any previous
/// server instance, starts a new one, connects to the test database and
/// loads the dataset. The calls run strictly in this order.
/// </summary>
public void TestFixtureSetup()
{
    var wrapper = new MongoDBWrapper();
    _mongoTest = wrapper;

    // Start from a clean slate before bringing the server up.
    wrapper.KillServerAndCleanup();
    wrapper.StartServer();

    wrapper.ConnectToTestDB();
    wrapper.LoadDataset();
}
// Instance constructor: obtains the class logger and configures the
// MongoDB wrapper from the values in Consts.
public MongoDBRepository()
{
    _logger = LogManager.GetCurrentClassLogger();
    _mongoDB = new MongoDBWrapper();

    // Server endpoint in "server:port" form
    string serverEndpoint = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);

    _mongoDB.ConfigureDatabase(
        Consts.MONGO_USER,
        Consts.MONGO_PASS,
        Consts.MONGO_AUTH_DB,
        serverEndpoint,
        Consts.MONGO_TIMEOUT,
        Consts.MONGO_DATABASE,
        Consts.MONGO_COLLECTION);
}
/// <summary>
/// Updates the status of a session-queue record. The query string supplies
/// "sessionid", "action", "computername" and "username" (matched
/// case-sensitively); the outcome is written to the response as plain text.
/// </summary>
/// <param name="context">Current HTTP context.</param>
public void ProcessRequest(HttpContext context)
{
    context.Response.ContentType = "text/plain";
    try
    {
        var query = context.Request.QueryString;
        foreach (String key in query.AllKeys)
        {
            // Keys that are not recognised are ignored.
            if (key == "sessionid")
            {
                _sessionID = query[key];
            }
            else if (key == "action")
            {
                _action = query[key];
            }
            else if (key == "computername")
            {
                _computerName = query[key];
            }
            else if (key == "username")
            {
                _userName = query[key];
            }
        }

        MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
        bool updated = dbwrapper.UpdateSessionQueueRecord(_sessionID, _action, _userName, _computerName);

        string message;
        if (updated)
        {
            message = "Successfully updated the pending status.";
        }
        else if (dbwrapper.Exceptions.Count > 0)
        {
            // The wrapper recorded a failure; report the first one.
            message = "UpdatePendingStatus::ProcessRequest Error: " + dbwrapper.Exceptions[0].Message;
        }
        else
        {
            message = "UpdatePendingStatus::ProcessRequest Error: Could not find sessionID " + _sessionID + ".";
        }
        context.Response.Write(message);
    }
    catch (Exception ex)
    {
        context.Response.Write("UpdatePendingStatus::ProcessRequest Error: " + ex.Message);
    }
    context.Response.End();
}
/// <summary>
/// Builds the list of pending patient profiles: reads the pending session
/// queue from MongoDB, fetches each referenced Patient from the Spark FHIR
/// endpoint and copies the session details onto the resulting profile.
/// </summary>
/// <returns>One profile per pending session; empty when nothing is pending.</returns>
/// <remarks>
/// Exceptions from the database or the FHIR client propagate to the caller
/// with their original stack trace (the previous catch/"throw ex" reset it).
/// </remarks>
private IList<PatientProfile> GetPendingPatients()
{
    List<PatientProfile> listPending = new List<PatientProfile>();

    MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
    List<SessionQueue> pendingSessionList = dbwrapper.GetPendingPatients();
    FhirClient client = new FhirClient(sparkEndpointAddress);

    // The prefix is the same for every patient; build it once.
    string patientAddressPrefix = sparkEndpointAddress.ToString() + "/Patient/";

    foreach (var pending in pendingSessionList)
    {
        string sparkAddress = patientAddressPrefix + pending.SparkReference;
        Patient pendingPatient = (Patient)client.Get(sparkAddress);

        PatientProfile patientProfile = new PatientProfile(pendingPatient, true);
        patientProfile.SessionID = pending._id;
        patientProfile.LocalNoID = pending.LocalReference;
        patientProfile.NoIDStatus = pending.ApprovalStatus;
        patientProfile.NoIDType = pending.PatientStatus;
        patientProfile.CheckinDateTime = FHIRUtilities.DateTimeToFHIRString(pending.SubmitDate);
        listPending.Add(patientProfile);
    }

    return listPending;
}
/// <summary>
/// Deletes the MongoDB databases and the fingerprint match database when the
/// request carries a "destroykey" query parameter equal to the configured
/// DestroyKey. The outcome is written to the response as plain text.
/// </summary>
/// <param name="context">Current HTTP context.</param>
public void ProcessRequest(HttpContext context)
{
    context.Response.ContentType = "text/plain";
    string purgeResult = "";
    string destroyKey = "";
    try
    {
        if (uint.TryParse(MinimumAcceptedMatchScore, out _minimumAcceptedMatchScore) == false)
        {
            _minimumAcceptedMatchScore = 30;
        }

        foreach (String key in context.Request.QueryString.AllKeys)
        {
            if (key == "destroykey")
            {
                destroyKey = context.Request.QueryString[key];
                break;
            }
        }

        // An empty or unconfigured DestroyKey must never authorise a purge:
        // without this guard a request with no key at all would equal an
        // empty configured key and wipe every database.
        bool keyAccepted = !string.IsNullOrEmpty(DestroyKey) && destroyKey == DestroyKey;

        if (keyAccepted)
        {
            MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
            if (dbwrapper.DeleteMongoDBs() == true)
            {
                FingerPrintMatchDatabase dbMinutia = new FingerPrintMatchDatabase(DatabaseLocation, BackupLocation, _minimumAcceptedMatchScore);
                if (dbMinutia.DeleteMatchDatabase())
                {
                    purgeResult = "Successful.";
                }
                else
                {
                    purgeResult = "Error in PurgeAllDatabases::ProcessRequest: Unable to delete all databases.";
                }
            }
            else
            {
                // Previously this failure produced an empty response.
                purgeResult = "Error in PurgeAllDatabases::ProcessRequest: Unable to delete the MongoDB databases.";
            }
        }
        else
        {
            //TODO: log this event as an invalid attempt due to mismatched keys.
            purgeResult = "Error in PurgeAllDatabases::ProcessRequest: Invalid destroy key.";
        }
    }
    catch (Exception ex)
    {
        purgeResult = "Error in PurgeAllDatabases::ProcessRequest: " + ex.Message;
    }
    context.Response.Write(purgeResult);
}
/// <summary>
/// Uploads every app record not yet flagged as uploaded from MongoDB to
/// Keen.IO, marking each record as updated once its event has been added.
/// A failure on one record is printed and the loop moves on to the next.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    // Keen.IO keys and bucket name come from the application config file
    var appSettings = ConfigurationManager.AppSettings;
    _keenIOProjectID = appSettings["keenIOProjectID"];
    _keenIOMasterKey = appSettings["keenIOMasterKey"];
    _keenIOWriteKey = appSettings["keenIOWriteKey"];
    _keenIOReadKey = appSettings["keenIOReadKey"];
    _bucketName = appSettings["keenIOBucketName"];

    // MongoDB wrapper used both for reading records and flagging them afterwards
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string serverEndpoint = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(
        Consts.MONGO_USER,
        Consts.MONGO_PASS,
        Consts.MONGO_AUTH_DB,
        serverEndpoint,
        Consts.MONGO_TIMEOUT,
        Consts.MONGO_DATABASE,
        Consts.MONGO_COLLECTION);

    // Keen.IO client
    var keenSettings = new ProjectSettingsProvider(_keenIOProjectID, _keenIOMasterKey, _keenIOWriteKey, _keenIOReadKey);
    var keenClient = new KeenClient(keenSettings);

    // Records whose "Uploaded" field is not true still have to be sent
    var pendingApps = mongoDB.FindMatch<AppModel>(Query.NE("Uploaded", true));
    foreach (var app in pendingApps)
    {
        try
        {
            keenClient.AddEvent("PlayStore2014", app);

            // Progress line on every 100th uploaded app
            _appsCounter++;
            bool reportProgress = (_appsCounter % 100) == 0;
            if (reportProgress)
            {
                Console.WriteLine("Uploaded : " + _appsCounter);
            }

            mongoDB.SetUpdated(app.Url);
        }
        catch (Exception ex)
        {
            Console.WriteLine("\n\t" + ex.Message);
        }
    }
}
/// <summary>
/// Reads the app records that have not been uploaded yet from MongoDB and
/// sends each one to Keen.IO, flagging it as updated afterwards. Errors on
/// a single record are written to the console without stopping the run.
/// </summary>
/// <param name="args">Unused.</param>
static void Main (string[] args)
{
    // Configuration: Keen.IO credentials and bucket name
    _keenIOProjectID = ConfigurationManager.AppSettings["keenIOProjectID"];
    _keenIOMasterKey = ConfigurationManager.AppSettings["keenIOMasterKey"];
    _keenIOWriteKey  = ConfigurationManager.AppSettings["keenIOWriteKey"];
    _keenIOReadKey   = ConfigurationManager.AppSettings["keenIOReadKey"];
    _bucketName      = ConfigurationManager.AppSettings["keenIOBucketName"];

    // MongoDB connection ("server:port")
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string mongoEndpoint = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB,
                               mongoEndpoint, Consts.MONGO_TIMEOUT,
                               Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Keen.IO client built from the loaded keys
    var keenClient = new KeenClient (
        new ProjectSettingsProvider (_keenIOProjectID, _keenIOMasterKey, _keenIOWriteKey, _keenIOReadKey));

    // Everything whose "Uploaded" flag is not true is still pending
    foreach (var pendingApp in mongoDB.FindMatch<AppModel> (Query.NE ("Uploaded", true)))
    {
        try
        {
            keenClient.AddEvent ("PlayStore2014", pendingApp);

            // Count the upload, reporting on every 100th one
            _appsCounter++;
            if (_appsCounter % 100 != 0)
            {
                // not a reporting step
            }
            else
            {
                Console.WriteLine ("Uploaded : " + _appsCounter);
            }

            mongoDB.SetUpdated (pendingApp.Url);
        }
        catch (Exception ex)
        {
            Console.WriteLine ("\n\t" + ex.Message);
        }
    }
}
/// <summary>
/// Copies posts from Mongo into SQL Server one at a time until the wrapper
/// reports no further post. Each post is inserted inside its own
/// transaction, its comments are copied through ReadComments, and the
/// transaction is committed afterwards.
/// </summary>
/// <param name="connection">SQL Server connection used for the inserts.</param>
/// <param name="wrapper">Source of posts (and, via ReadComments, comments).</param>
private static async Task ReadPosts(SqlConnection connection, MongoDBWrapper wrapper)
{
    // Inserts the post and returns the generated id as text
    const string insertPostSql =
        "DECLARE @newRecord table(newId uniqueidentifier); " +
        "INSERT INTO Post " +
        "(Text, WorkoutDate) " +
        "OUTPUT INSERTED.Id INTO @newRecord " +
        "VALUES " +
        "(@text, @workoutDate) " +
        "SELECT CONVERT(nvarchar(50), newId) FROM @newRecord";

    for (;;)
    {
        // Next post from Mongo; null signals the end
        var post = await wrapper.GetNextPost();
        if (post == null)
        {
            break;
        }

        Console.WriteLine($"ReadPosts: Read {post.Id}");

        using var transaction = connection.BeginTransaction();

        var insertArgs = new { text = post.Text, workoutDate = post.WorkoutDate };
        var insertedIds = await connection.QueryAsync<string>(insertPostSql, insertArgs, transaction);
        string sqlPostId = insertedIds.Single();

        // Comments belong to the same transaction as their post
        await ReadComments(transaction, connection, sqlPostId, post.Id.ToString(), wrapper);

        transaction.Commit();
    }

    Console.WriteLine("ReadPosts: End");
}
/// <summary>
/// Parses the request body as a BSON document, passes it to
/// MongoDBWrapper.RunJenkinsJob and returns the result with HTTP 200.
/// Any exception is logged and answered with HTTP 500 carrying its message.
/// </summary>
/// <param name="request">Incoming request whose content is the JSON payload.</param>
/// <returns>The HTTP response describing the outcome.</returns>
public async Task<HttpResponseMessage> RunCTAlgorithm(HttpRequestMessage request)
{
    try
    {
        // Record who called and through which URL
        var caller = HttpContext.Current.Request;
        string callInfo = "Calling Jenkins Job, IP: " + caller.UserHostAddress + ", Client: " + caller.Url.AbsoluteUri;
        MLPExecutionLogger.Info("CTPhantom", callInfo);

        var mongoBase = new MongoDBWrapper();

        string payload = await request.Content.ReadAsStringAsync();
        BsonDocument jobDocument = BsonDocument.Parse(payload);

        var jobResult = await mongoBase.RunJenkinsJob(jobDocument);
        return Request.CreateResponse(HttpStatusCode.OK, jobResult);
    }
    catch (Exception ex)
    {
        MLPExecutionLogger.Error("CTPhantom", ex.Message);
        return Request.CreateResponse(HttpStatusCode.InternalServerError, ex.Message);
    }
}
/// <summary>
/// Enrolls a new patient from the FHIR message in the request body and
/// writes the outcome to the response: "Successful." or an error message.
/// </summary>
/// <param name="context">Current HTTP context; the request body carries the FHIR Patient.</param>
/// <remarks>
/// The response is always written. Previously a Spark failure returned before
/// writing anything, and earlier error messages were overwritten by "Successful.".
/// </remarks>
public void ProcessRequest(HttpContext context)
{
    try
    {
        _responseText = EnrollPatient(context);
        //LogUtilities.LogEvent("Ending AddNewPatient.ashx");
    }
    catch (Exception ex)
    {
        _responseText = "Error in AddNewPatient::ProcessRequest: " + ex.Message;
        LogUtilities.LogEvent(_responseText);
    }
    context.Response.Write(_responseText);
    context.Response.End();
}

/// <summary>
/// Saves the patient to Spark, stores fingerprints or evaluates the alternate
/// Q&amp;A pathway, and queues the patient as pending.
/// </summary>
/// <returns>"Successful." or the text of the error that occurred.</returns>
private string EnrollPatient(HttpContext context)
{
    if (uint.TryParse(MinimumAcceptedMatchScore, out _minimumAcceptedMatchScore) == false)
    {
        _minimumAcceptedMatchScore = 30;
    }

    Stream httpStream = context.Request.InputStream;
    StreamReader httpStreamReader = new StreamReader(httpStream);
    Resource newResource = FHIRUtilities.StreamToFHIR(httpStreamReader);
    _patient = (Patient)newResource;

    //TODO: make sure this FHIR message has a new pending status.
    //TODO: make this an atomic transaction.
    //      delete the FHIR message from Spark if there is an error in the minutia.
    Patient ptSaved = (Patient)SendPatientToSparkServer();
    //LogUtilities.LogEvent("AddNewPatient.ashx Saved FHIR in spark.");
    if (ptSaved == null)
    {
        return "Error sending Patient FHIR message to the Spark FHIR endpoint. " + ExceptionString;
    }

    SourceAFIS.Templates.NoID noID = new SourceAFIS.Templates.NoID();
    noID.SessionID = ptSaved.Id.ToString();
    //TODO: Add Argon2d hash here:
    noID.LocalNoID = "noid://" + DomainName + "/" + StringUtilities.SHA256(DomainName + noID.SessionID + NodeSalt);

    SessionQueue seq = Utilities.PatientToSessionQueue(_patient, ptSaved.Id.ToString(), noID.LocalNoID, "new", "pending");
    seq.SubmitDate = DateTime.UtcNow;
    //TODO: send to selected match hub and get the remote hub ID.
    //      Hub ID in the same format: noid://domain/LocalID

    string errorText = null;
    bool biometricsSaved;
    if (_patient.Photo.Count > 0)
    {
        errorText = SaveFingerprintTemplates(noID);
        // As before, the patient is queued even when a template failed to store;
        // the difference is that the failure is now reported to the caller.
        biometricsSaved = true;
    }
    else
    {
        // check alternate pathway Q&A
        biometricsSaved = ApplyAlternatePathway(seq);
        // log patient in alternatesearch container
    }

    if (biometricsSaved)
    {
        MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
        dbwrapper.AddPendingPatient(seq);
    }
    else
    {
        errorText = "Critical Error! No biometrics or alternates provided. Can not complete enrollment.";
        LogUtilities.LogEvent(errorText);
    }
    //TODO: end atomic transaction.

    return errorText ?? "Successful.";
}

/// <summary>
/// Converts every photo attachment of the current patient into a fingerprint
/// template and adds it to the match database, which is disposed afterwards
/// even when an exception interrupts the loop.
/// </summary>
/// <returns>An error message if any template could not be added; otherwise null.</returns>
private string SaveFingerprintTemplates(SourceAFIS.Templates.NoID noID)
{
    string errorText = null;
    dbMinutia = new FingerPrintMatchDatabase(DatabaseDirectory, BackupDatabaseDirectory, _minimumAcceptedMatchScore);
    try
    {
        foreach (var minutia in _patient.Photo)
        {
            byte[] byteMinutias = minutia.Data;
            Stream stream = new MemoryStream(byteMinutias);
            Media media = (Media)FHIRUtilities.StreamToFHIR(new StreamReader(stream));

            // Save minutias for matching.
            Template fingerprintTemplate = ConvertFHIR.FHIRToTemplate(media);
            fingerprintTemplate.NoID = noID;
            try
            {
                dbMinutia.LateralityCode = (FHIRUtilities.LateralitySnoMedCode)fingerprintTemplate.NoID.LateralitySnoMedCode;
                dbMinutia.CaptureSite = (FHIRUtilities.CaptureSiteSnoMedCode)fingerprintTemplate.NoID.CaptureSiteSnoMedCode;
            }
            catch (Exception ex)
            {
                // Best effort, as before, but no longer silent.
                LogUtilities.LogEvent("AddNewPatient::ProcessRequest: unable to set laterality/capture site: " + ex.Message);
            }

            if (dbMinutia.AddTemplate(fingerprintTemplate) == false)
            {
                errorText = "Error adding a fingerprint to the match database.";
                LogUtilities.LogEvent(errorText);
            }
        }
    }
    finally
    {
        dbMinutia.Dispose();
    }
    return errorText;
}

/// <summary>
/// Reads the "biometric" identifier extensions (reason, two questions, two
/// answers) of the current patient and sets the patient status on the
/// session record according to the stated reason.
/// </summary>
/// <returns>True when a reason, both questions and both answers were all supplied.</returns>
private bool ApplyAlternatePathway(SessionQueue seq)
{
    bool alternateProvided = false;
    string missingReason = "";
    string question1 = "";
    string question2 = "";
    string answer1 = "";
    string answer2 = "";

    foreach (var id in _patient.Identifier)
    {
        if (ContainsIgnoreCase(id.System, "biometric") == false)
        {
            continue;
        }
        if (id.Extension == null || id.Extension.Count == 0)
        {
            // Nothing to read; the original indexed [0] unconditionally and threw.
            continue;
        }

        Extension extExceptionQA = id.Extension[0];
        foreach (var ext in extExceptionQA.Extension)
        {
            string value = ext.Value == null ? "" : ext.Value.ToString();
            if (ContainsIgnoreCase(ext.Url, "reason"))
            {
                missingReason = value;
            }
            else if (ContainsIgnoreCase(ext.Url, "question 1"))
            {
                question1 = value;
            }
            else if (ContainsIgnoreCase(ext.Url, "answer 1"))
            {
                answer1 = value;
            }
            else if (ContainsIgnoreCase(ext.Url, "question 2"))
            {
                question2 = value;
            }
            else if (ContainsIgnoreCase(ext.Url, "answer 2"))
            {
                answer2 = value;
            }
        }

        bool allAnswered = missingReason.Length > 0
            && question1.Length > 0 && answer1.Length > 0
            && question2.Length > 0 && answer2.Length > 0;
        if (allAnswered)
        {
            // Any other reason leaves the status assigned by PatientToSessionQueue.
            switch (missingReason)
            {
                case "I am permanently physically unable to provide fingerprints":
                    seq.PatientStatus = "new***";
                    break;
                case "I am temporarily physically unable to provide fingerprints":
                    seq.PatientStatus = "hold**";
                    break;
                case "I attempted the fingerprint scan process, but I could not get a successful scan on either hand":
                    seq.PatientStatus = "hold";
                    break;
            }
            alternateProvided = true;
        }
    }
    return alternateProvided;
}

/// <summary>
/// Null-safe, culture-independent, case-insensitive substring test.
/// </summary>
private static bool ContainsIgnoreCase(string text, string fragment)
{
    return text != null && text.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0;
}
/// <summary>
/// Crawls one Play Store category: requests the category page, then keeps
/// posting paging requests, queueing every app url that is neither already
/// processed nor already queued. Paging stops when a url repeats, when a
/// page yields no urls, or when too many HTTP errors have occurred.
/// </summary>
/// <param name="categoryUrl">Url of the category page</param>
/// <param name="categoryName">Category name, used for log output only</param>
/// <param name="shouldUseProxies">
/// NOTE(review): not read anywhere in this method — confirm whether the
/// requests here are expected to go through a proxy.
/// </param>
private static void CrawlCategory(string categoryUrl, string categoryName, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn("Crawling Category : [ " + categoryName + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String>();

    // Control variable to avoid "Loop" on pagging
    bool isDonePagging = false;

    // HTML Response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Executing Initial Request
        response = server.Get(categoryUrl);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            foundUrls.Add(url);
            QueueCategoryAppIfNew(mongoDB, url);
        }

        // Executing Requests for more Play Store Links
        int baseSkip = 60;
        int currentMultiplier = 1;
        int errorsCount = 0;

        // Plain concatenation: the url is not a format string, and passing it
        // through String.Format would throw if it ever contained '{' or '}'.
        string pagingUrl = categoryUrl + "?authuser=0";

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

            // Executing request for values
            response = server.Post(pagingUrl, postData);

            // Checking Server Status; on failure the same page is requested again
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;
                continue;
            }

            // Parsing Links
            bool pageHadUrls = false;
            foreach (string url in parser.ParseAppUrls(response))
            {
                pageHadUrls = true;

                // If a certain app is found twice, it means that the "pagging" logic got stuck
                // into a loop, so all the apps for this category were parsed already.
                // HashSet.Add returns false when the url was already present.
                if (!foundUrls.Add(url))
                {
                    isDonePagging = true;
                    break;
                }

                QueueCategoryAppIfNew(mongoDB, url);
            }

            // A successful page with no urls at all can never trip the duplicate
            // check above; without this the loop would page forever.
            if (!pageHadUrls)
            {
                isDonePagging = true;
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;

        } while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}

/// <summary>
/// Queues the app url unless it has already been processed or is already queued.
/// </summary>
private static void QueueCategoryAppIfNew(MongoDBWrapper mongoDB, string url)
{
    if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
    {
        mongoDB.AddToQueue(url);
    }
}
/// <summary>
/// Entry point of the crawler: sets up logging, optionally loads proxies,
/// prepares the MongoDB collections and indexes, then bootstraps the queue
/// by crawling categories and a series of search terms.
/// </summary>
/// <param name="args">When exactly one argument is given, it is the path of a text file with proxies</param>
static void Main(string[] args)
{
    // Log setup
    LogSetup.InitializeLog("PlayStoreCrawler.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Proxies are used only when a single argument (the proxies file) was received
    bool isUsingProxies = args != null && args.Length == 1;
    if (isUsingProxies)
    {
        _logger.Info("Loading Proxies from File");

        string proxiesPath = args[0];

        // Sanity check: the file has to exist
        if (File.Exists(proxiesPath) == false)
        {
            _logger.Fatal("Couldnt find proxies on path : " + proxiesPath);
            System.Environment.Exit(-100);
        }

        string[] proxyLines = File.ReadAllLines(proxiesPath, Encoding.GetEncoding("UTF-8"));
        try
        {
            ProxiesLoader.Load(proxyLines.ToList());
        }
        catch (Exception ex)
        {
            _logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // MongoDB wrapper configuration
    _logger.Info("Setting up MongoDB Collections and Indexes");
    _mongoDB = new MongoDBWrapper();
    string serverEndpoint = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    _mongoDB.ConfigureDatabase(
        Consts.MONGO_USER,
        Consts.MONGO_PASS,
        Consts.MONGO_AUTH_DB,
        serverEndpoint,
        Consts.MONGO_TIMEOUT,
        Consts.MONGO_DATABASE,
        Consts.MONGO_COLLECTION);

    // Indexes on the main collection and on the queue collection
    _mongoDB.EnsureIndex("Url");
    _mongoDB.EnsureIndex("IsBusy", Consts.QUEUED_APPS_COLLECTION);
    _mongoDB.EnsureIndex("Url", Consts.QUEUED_APPS_COLLECTION);

    // Main flow
    _logger.Info("Started Bootstrapping Steps");

    // Play Store categories (url -> name)
    foreach (var categoryEntry in BootstrapTerms.categoriesAndNames)
    {
        CrawlCategory(categoryEntry.Key, categoryEntry.Value, isUsingProxies);
    }

    // Single-character search terms
    foreach (var term in BootstrapTerms.charactersSearchTerms)
    {
        CrawlStore(term, isUsingProxies);
    }

    // ... Keep adding characters / search terms in order to increase the crawler's reach

    // App categories used as search terms
    foreach (var term in BootstrapTerms.categoriesSearchTerms)
    {
        CrawlStore(term, isUsingProxies);
    }

    // Extra "random" search terms
    foreach (var term in BootstrapTerms.miscSearchTerms)
    {
        CrawlStore(term, isUsingProxies);
    }

    // Country names as search terms
    foreach (var term in BootstrapTerms.countryNames)
    {
        CrawlStore(term, isUsingProxies);
    }
}
/// <summary>
/// Reviews worker: repeatedly takes an app whose reviews have not been
/// visited, downloads and parses its first page of reviews, merges the new
/// ones into the app record and saves the record with a "Visited" or
/// "Error" status.
/// </summary>
/// <param name="args">When exactly one argument is given, it is the path of a text file with proxies</param>
static void Main (string[] args)
{
    // Configuring Log Object
    Logger logger = LogManager.GetCurrentClassLogger ();

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    logger.Info ("Checking proxies configuration");

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists (fPath))
        {
            logger.Fatal ("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit (-100);
        }

        // Reading Proxies from File
        logger.Info ("Loading Proxies");
        string[] fLines = File.ReadAllLines (fPath, Encoding.GetEncoding ("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load (fLines.ToList ());
        }
        catch (Exception ex)
        {
            logger.Fatal (ex);
            System.Environment.Exit (-101);
        }
    }

    // MongoDB instance Creation
    logger.Info ("Configuring MonboDB Client");

    // Creating instance of Mongo Handler for the main collection
    MongoDBWrapper mongoClient = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoClient.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    logger.Info ("Iterating over Apps");

    // Creating Play Store Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // App Model
    AppModel appRecord;

    // Finding all the "Apps" that didn't have the reviews visited yet
    while ((appRecord = mongoClient.FindAndModifyReviews ()) != null)
    {
        // Tracked per app. It used to be declared outside the loop and never
        // reset, so one failure marked every following app as "Error".
        bool noError = true;

        // Extracting app ID from URL
        string appId = appRecord.Url.Replace (Consts.PLAY_STORE_PREFIX, String.Empty);

        // Console Feedback
        logger.Info ("Processing App [ " + appRecord.Name + " ] ");

        try
        {
            // Console Feedback
            Console.Write ("Reviews from : " + appRecord.Name);

            // Issuing Request for Reviews
            string response = ReviewsWrapper.GetAppReviews (appId, 1, isUsingProxies);

            // Checking for Blocking Situation
            if (String.IsNullOrEmpty (response))
            {
                // The message now states the delay actually applied below
                logger.Info ("Blocked by Play Store. Sleeping process for 10 seconds before retrying.");

                // Thread Wait for 10 seconds
                Thread.Sleep (TimeSpan.FromSeconds (10));
            }

            // A null response used to crash on ".Length" below and end up as
            // "Error" through the catch block; the same outcome is now explicit.
            if (response == null)
            {
                logger.Error ("No response received for app [ " + appRecord.Name + " ]");
                noError = false;
                continue;
            }

            // Checking for "No Reviews" app
            if (response.Length < 50)
            {
                logger.Info ("No Reviews for this app. Skipping");
                Console.Write (" - No Reviews Found\n");
                continue;
            }

            // Normalizing Response to Proper HTML
            response = ReviewsWrapper.NormalizeResponse (response);

            // List of Reviews
            List<AppReview> reviews = new List<AppReview> ();

            // Iterating over Parsed Reviews
            foreach (var review in parser.ParseReviews (response))
            {
                // Adding App Data to the review
                review.appID = appId;
                review.appName = appRecord.Name;
                review.appURL = appRecord.Url;

                // Capture Timestamp to the model
                review.timestamp = DateTime.Now;

                // Adding reviews to the current list
                reviews.Add (review);
            }

            // Any Review Found ?
            if (reviews.Count > 0)
            {
                Console.Write (" - " + reviews.Count + " Reviews Found\n");

                // Checking if there was any previous list of reviews
                if (appRecord.Reviews == null)
                {
                    appRecord.Reviews = reviews;
                }
                else // Previous List found - Appending only the new ones
                {
                    foreach (var review in reviews)
                    {
                        if (!appRecord.Reviews.Any (t => t.permalink.Equals (review.permalink)))
                        {
                            appRecord.Reviews.Add (review);
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error (ex);
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine ("Error : " + ex.Message);
            Console.ForegroundColor = ConsoleColor.White;
            noError = false;
        }
        finally
        {
            // Runs for the "continue" paths too: every app taken from the
            // database is saved back with its final status.
            appRecord.ReviewsStatus = noError ? "Visited" : "Error";
            mongoClient.SaveRecord<AppModel> (appRecord);
        }
    }
}
/// <summary>
/// Runs a Play Store search for the given term and follows the result
/// pages through the paging token found in each response, adding every
/// app url that is neither processed nor queued to the MongoDB queue.
/// </summary>
/// <param name="searchField">Search term to crawl</param>
private static void CrawlStore (string searchField)
{
    Console.WriteLine ("Crawling Search Term : [ " + searchField + " ]");

    // Regular expression that extracts the "pagToken" from a response
    Regex pagTokenRegex = new Regex (@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // MongoDB wrapper configuration
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string serverEndpoint = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverEndpoint, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);
    mongoDB.EnsureIndex ("Url");

    PlayStoreParser parser = new PlayStoreParser ();

    // The same search url serves the initial request and every paging request
    string searchUrl = String.Format (Consts.CRAWL_URL, searchField);

    using (WebRequests server = new WebRequests ())
    {
        server.Host = Consts.HOST;

        // Initial request and its links
        string response = server.Post (searchUrl, Consts.INITIAL_POST_DATA);
        QueueSearchResultUrls (mongoDB, parser, response);

        // Follow-up requests, one per paging token
        int errorsCount = 0;
        do
        {
            var tokenMatch = pagTokenRegex.Match (response);

            // Without a token there is no next page
            if (!tokenMatch.Success)
            {
                break;
            }

            // NOTE(review): the verbatim literal below is two backslashes followed
            // by "u003d", while "\\42" is a single backslash followed by "42" —
            // confirm that the difference is intended.
            string pagToken = tokenMatch.Value
                .Replace (":S:", "%3AS%3A")
                .Replace ("\\42", String.Empty)
                .Replace (@"\\u003d", String.Empty);

            string postData = String.Format (Consts.POST_DATA, pagToken);
            response = server.Post (searchUrl, postData);

            if (server.StatusCode == System.Net.HttpStatusCode.OK)
            {
                QueueSearchResultUrls (mongoDB, parser, response);
            }
            else
            {
                LogWriter.Error ("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;
            }

        } while (parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}

/// <summary>
/// Queues each app url parsed from the response unless it is already
/// processed or queued, printing the decision taken for every url.
/// </summary>
private static void QueueSearchResultUrls (MongoDBWrapper mongoDB, PlayStoreParser parser, string response)
{
    foreach (string url in parser.ParseAppUrls (response))
    {
        bool alreadyKnown = mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url) || mongoDB.AppQueued (url);
        if (alreadyKnown)
        {
            Console.WriteLine (" . Duplicated App. Skipped");
            continue;
        }

        Console.WriteLine (" . Queued App");
        mongoDB.AddToQueue (url);
        Thread.Sleep (250); // Hiccup
    }
}
/// <summary>
/// Sends a batch of app events to Keen.IO and then flags each of them as
/// updated in MongoDB.
/// </summary>
/// <param name="keenClient">Client used to add the events</param>
/// <param name="eventsToSend">Batch of app records to send</param>
/// <param name="mongoDB">Wrapper used to flag the sent records</param>
/// <returns>
/// The batch size when everything succeeded; 0 when any step threw
/// (the exception message is printed to the console).
/// </returns>
private static int SendEventsToKeep(Keen.Core.KeenClient keenClient, List<AppModel> eventsToSend, MongoDBWrapper mongoDB)
{
    int sentCount = 0;

    try
    {
        keenClient.AddEvents("PlayStore2014", eventsToSend);

        // Running total; feedback is printed only when it lands on a multiple of 100
        _appsCounter += eventsToSend.Count;
        bool onHundredBoundary = (_appsCounter % 100) == 0;
        if (onHundredBoundary)
        {
            Console.WriteLine("Uploaded : " + _appsCounter);
        }

        foreach (var sentEvent in eventsToSend)
        {
            mongoDB.SetUpdated(sentEvent.Url);
        }

        sentCount = eventsToSend.Count;
    }
    catch (Exception ex)
    {
        Console.WriteLine("\n\t" + ex.Message);
    }

    return sentCount;
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Notice that you can run as many workers as you want to in order to make the crawling faster.
/// Each iteration takes one queued app from MongoDB, downloads and parses its page, stores the
/// parsed app and queues the related apps found on that page.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is treated as the path of a text file
/// holding the proxies to be used for the requests.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies?)
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over MongoDB records until no document is left to be processed
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL
            string appUrl = app.Url;

            // Sanity check of app page url: queued urls may or may not carry the prefix
            if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                appUrl = Consts.APP_URL_PREFIX + app.Url;
            }

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                // Console Feedback, comment this line to disable if you want to
                logger.Info("Duplicated App, skipped.");

                // Delete it from the queue and continue the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.UserAgent = Consts.GITHUBURL;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            // Checking for the need to use "HTTP Proxies"
            if (isUsingProxies)
            {
                server.Proxy = ProxiesLoader.GetWebProxy();
            }

            // Issuing HTTP Request
            string response = server.Get(appUrl);

            // Flag Indicating Success while processing and parsing this app
            bool ProcessingWorked = true;

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                logger.Info("Error opening app page : " + appUrl);
                ProcessingWorked = false;

                // The status of the failed request must be captured BEFORE the request object is
                // renewed; otherwise the "404" check below would inspect the fresh object instead
                var failedStatusCode = server.StatusCode;

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Fallback time variable (milliseconds)
                double waitTime;

                // Checking which "Waiting Logic" to use - If there are proxies being used, there's no need to wait too much
                // If there are no proxies in use, on the other hand, the process must wait more time
                if (isUsingProxies)
                {
                    // Waits two seconds everytime
                    waitTime = TimeSpan.FromSeconds(2).TotalMilliseconds;
                }
                else
                {
                    // Increments retry counter
                    retryCounter++;

                    // Checking for maximum retry count
                    if (retryCounter >= 8)
                    {
                        waitTime = TimeSpan.FromMinutes(20).TotalMilliseconds;
                    }
                    else
                    {
                        // Calculating next wait time ( 2 ^ retryCounter seconds)
                        waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                    }
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                logger.Info("======================================================");

                // waitTime is held in milliseconds; convert it so the value matches the "Seconds" label
                logger.Info("\n\tFallback : " + (waitTime / 1000) + " Seconds");
                Thread.Sleep(Convert.ToInt32(waitTime));

                // If The Status code is "ZERO" (it means 404) - App must be removed from "Queue"
                if (failedStatusCode == 0)
                {
                    // Console Feedback
                    logger.Info("\tApp Not Found (404) - " + app.Url);
                    mongoDB.RemoveFromQueue(app.Url);
                }

                logger.Info("======================================================");
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                List<String> relatedApps = new List<String>();

                // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                try
                {
                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps(response))
                    {
                        relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                    }

                    // Adding "Related Apps" to Apps Model
                    parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                }
                catch
                {
                    logger.Info("\tNo Related Apps Found. Skipping");
                }

                // Inserting App into Mongo DB Database
                if (!mongoDB.Insert<AppModel>(parsedApp))
                {
                    ProcessingWorked = false;
                }

                // If the processing failed, do not remove the app from the database, instead, keep it and flag it as not busy
                // so that other workers can try to process it later
                if (!ProcessingWorked)
                {
                    mongoDB.ToggleBusyApp(app, false);
                }
                else // On the other hand, if processing worked, removes it from the database
                {
                    // Console Feedback, comment this line to disable if you want to
                    Console.ForegroundColor = ConsoleColor.Red;
                    logger.Info("Inserted App : " + parsedApp.Name);
                    Console.ForegroundColor = ConsoleColor.White;

                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Queueing "Related Apps" and "More From Developer" Apps (URLS Only)
                foreach (string extraAppUrl in relatedApps)
                {
                    // Incrementing counter of extra apps
                    extraAppsCounter++;

                    // Entries of relatedApps were already prefixed when they were collected above,
                    // so the prefix is only applied when it is actually missing (same guard used
                    // for the queued app url at the top of the loop)
                    string fullExtraAppUrl = extraAppUrl.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0
                        ? Consts.APP_URL_PREFIX + extraAppUrl
                        : extraAppUrl;

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        // Incrementing counter of inserted apps
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // Toggle Busy App may raise an exception in case of lack of internet connection, so
                // this "inner catch" keeps that failure from killing the worker loop
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Crawls one Play Store category: requests the category page, then keeps posting paged
/// requests, adding every app url that is neither processed nor queued yet to the Mongo queue.
/// Paging stops when an already seen url shows up again or when too many http errors happen.
/// </summary>
/// <param name="categoryUrl">Url of the category page to be crawled.</param>
/// <param name="categoryName">Name of the category, used for log output only.</param>
/// <param name="shouldUseProxies">When true, requests are issued through a proxy obtained from ProxiesLoader.</param>
private static void CrawlCategory(string categoryUrl, string categoryName, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn("Crawling Category : [ " + categoryName + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String>();

    // Control variable to avoid "Loop" on paging
    bool isDonePagging = false;

    // HTML Response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies" - the flag used to be ignored here,
        // while the search crawler (CrawlStore) honors it
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy();
        }

        // Executing Initial Request
        response = server.Get(categoryUrl);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls(response))
        {
            // Saving found url on local hashset
            foundUrls.Add(url);

            // Checks whether the app has been already processed
            // or is queued to be processed
            if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
            {
                // Then, queue it :)
                mongoDB.AddToQueue(url);
            }
        }

        // Plain concatenation: the url must not be treated as a format string, otherwise
        // a '{' or '}' in it would raise a FormatException
        string pagedUrl = categoryUrl + "?authuser=0";

        // Executing Requests for more Play Store Links
        int baseSkip = 60;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.CATEGORIES_POST_DATA, (currentMultiplier * baseSkip), baseSkip);

            // Executing request for values
            response = server.Post(pagedUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // The multiplier is left untouched, so the same page is requested again
                continue;
            }

            // Parsing Links
            foreach (string url in parser.ParseAppUrls(response))
            {
                // If a certain app is found twice, it means that the paging logic got stuck into a
                // loop, so all the apps for this category were parsed already
                if (foundUrls.Contains(url))
                {
                    isDonePagging = true;
                    break;
                }

                // Saving found url on local hashset
                foundUrls.Add(url);

                // Checks whether the app has been already processed
                // or is queued to be processed
                if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
                {
                    // Then, queue it :)
                    mongoDB.AddToQueue(url);
                }
            }

            // Incrementing Paging Multiplier
            currentMultiplier++;

        } while (!isDonePagging && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// *** READ THIS BEFORE YOU START. ***
/// *** I MEAN IT, PLEASE, READ IT ***
///
/// This exporting helper will download ALL THE APPS found on the database, and
/// dump it to a CSV file (with headers).
///
/// Note that, since the database is Hosted on AWS, i will PAY (for the internet outbound traffic) if you execute a full database export,
/// so, if you are going to execute a full export, please, get in touch with me before running this project, or send me a donation
/// via paypal on [email protected]
///
/// Also, be nice with the database.
///
/// ** END OF WARNING ***
///
/// Expects exactly one argument (validated by ValidateArgs / ValidateFilePermissions):
/// args[0] is the path of the output CSV file, which is opened in append mode.
static void Main(string[] args)
{
    // Logs Counter
    int processedApps = 0;

    // Configuring Log Object
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    logger.Info("Checking Arguments");

    // Periodic Log Timer - kept inside a using block so the timer stays referenced for the
    // whole export (an unreferenced timer may be garbage collected) and is released at the end
    using (Timer loggingThread = new Timer((state) =>
    {
        logger.Info("Processed Apps: " + processedApps);
    }, null, 10000, 10000))
    {
        // Validating Arguments
        if (!ValidateArgs(args))
        {
            logger.Fatal("Invalid Args", "Args must have 1 element");
            return;
        }

        logger.Info("Checking Write Permissions on output Path");

        // Validating Write Permissions on output path
        if (!ValidateFilePermissions(args[0]))
        {
            logger.Fatal("Insuficient Permissions", "Cannot write on path : " + args[0]);
            return;
        }

        // Configuring MongoDB Wrapper
        MongoDBWrapper mongoDB = new MongoDBWrapper();
        string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
        mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

        // Opening Output Stream
        using (StreamWriter sWriter = new StreamWriter(args[0], true, Encoding.GetEncoding("ISO-8859-1")))
        {
            // Auto Flush Content
            sWriter.AutoFlush = true;

            // Writing Headers - every fragment but the last one must end with a comma, otherwise
            // two column names get fused (it used to produce "LastUpdateDateAppSize")
            String headersLine = "Url,ReferenceDate,Name,Developer,IsTopDeveloper,DeveloperURL,PublicationDate,"
                               + "Category,IsFree,Price,Reviewers,Score.Total,Score.Count,Score.FiveStars,"
                               + "Score.FourStars,Score.ThreeStars,Score.TwoStars,Score.OneStars,LastUpdateDate,"
                               + "AppSize,Instalations,CurrentVersion,MinimumOSVersion,ContentRating,HaveInAppPurchases,DeveloperEmail,DeveloperWebsite,DeveloperPrivacyPolicy";

            sWriter.WriteLine(headersLine);

            // Example of MongoDB Query Construction
            // Queries for records which have the attribute "IsTopDeveloper" equal to "false"
            //var mongoQuery = Query.EQ ("IsTopDeveloper", false);
            var mongoQuery = Query.EQ("Category", "/store/apps/category/SPORTS");

            // More Examples of Queries
            // var mongoQuery = Query.EQ ("Category", "/store/apps/category/GAME_CASINO");
            // var mongoQuery = Query.GT ("Price", 10);

            // Reading all apps from the database
            // USAGE: CHANGE FindMatches to FindAll if you want to export all the records from the database
            foreach (AppModel app in mongoDB.FindMatch<AppModel>(mongoQuery))
            {
                try
                {
                    // Writing line to File
                    sWriter.WriteLine(app.ToString());
                    processedApps++;
                }
                catch (Exception ex)
                {
                    // A single bad record must not abort the whole export
                    logger.Error(ex);
                }
            }
        }

        // Logging end of the Process
        logger.Info("Finished Exporting Database");
    }
}
/// <summary>
/// Handles a fingerprint identity request: reads a FHIR Media resource from the request body,
/// converts it to a fingerprint template and searches the local match database for it.
/// The response body is one of: the matched NoID, "pending" when the matched patient's current
/// status is pending, "No local database match.", or the exception message when processing fails.
/// </summary>
/// <param name="context">Http context carrying the request stream and receiving the text response.</param>
public void ProcessRequest(HttpContext context)
{
    try
    {
        // Falls back to a score of 30 when the configured value is not a valid unsigned integer
        if (uint.TryParse(MinimumAcceptedMatchScore, out _minimumAcceptedMatchScore) == false)
        {
            _minimumAcceptedMatchScore = 30;
        }

        Resource newResource = FHIRUtilities.StreamToFHIR(new StreamReader(context.Request.InputStream));
        _biometics = (Media)newResource;

        // TODO send to biometric match engine. If found, add patient reference to FHIR message.
        // convert FHIR fingerprint message (_biometics) to AFIS template class
        Template probe = ConvertFHIR.FHIRToTemplate(_biometics);

        dbMinutia = new FingerPrintMatchDatabase(_databaseDirectory, _backupDatabaseDirectory, _minimumAcceptedMatchScore);
        try
        {
            try
            {
                dbMinutia.LateralityCode = (FHIRUtilities.LateralitySnoMedCode)probe.NoID.LateralitySnoMedCode;
                dbMinutia.CaptureSite = (FHIRUtilities.CaptureSiteSnoMedCode)probe.NoID.CaptureSiteSnoMedCode;
            }
            catch (Exception codeEx)
            {
                // Best effort: the search still runs without these codes, but the failure
                // is recorded instead of being silently swallowed
                LogUtilities.LogEvent("Could not read laterality / capture site from probe: " + codeEx.Message);
            }

            MinutiaResult minutiaResult = dbMinutia.SearchPatients(probe);
            if (minutiaResult != null && minutiaResult.NoID != null && minutiaResult.NoID.Length > 0)
            {
                // Fingerprint found in database
                // check if patient is already pending.
                MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
                string currentStatus = dbwrapper.GetCurrentStatus(minutiaResult.NoID);

                // Ordinal, case-insensitive comparison: not affected by the server culture and
                // treats a null status as "not pending" instead of throwing
                if (string.Equals(currentStatus, "pending", StringComparison.OrdinalIgnoreCase))
                {
                    _responseText = "pending";
                }
                else
                {
                    _responseText = minutiaResult.NoID; //TODO: for now, it returns the localNoID. should return a FHIR response.
                }
            }
            else
            {
                _responseText = "No local database match.";
            }
            LogUtilities.LogEvent(_responseText);
        }
        finally
        {
            // Released on every path, including when the search or the status lookup throws
            dbMinutia.Dispose();
            LogUtilities.LogEvent("After dbMinutia.Dispose();");
        }
    }
    catch (Exception ex)
    {
        _exception = ex;
        _responseText = ex.Message;
    }
    context.Response.Write(_responseText);
    context.Response.End();
}
/// <summary>
/// Three-phase tool driven by a file of app urls (one per line):
/// 1. download / parse each app page and insert or update it on MongoDB,
/// 2. download the reviews of each of those apps,
/// 3. visit the reviewers' pages through a Selenium-driven Firefox and store their data.
/// NOTE: the "goto PeopleData" below currently jumps straight to phase 3, so phases 1 and 2
/// are not executed as the code stands.
/// </summary>
/// <param name="args">Exactly one element: the path of the input file with the app urls.</param>
static void Main(string[] args)
{
    // Checking for Input Parameters
    if (args == null || args.Length != 1)
    {
        Console.WriteLine("Incorrect number of arguments received. Expected One");
        System.Environment.Exit(-100);
    }

    // Human Readable Variable
    string inputFile = args[0];

    // Checking if the Input file received exists
    if (!File.Exists(inputFile))
    {
        Console.WriteLine(String.Format("Received input file does not exist : {0}", inputFile));
        System.Environment.Exit(-101);
    }

    // App Status
    _appStatus = new Dictionary<String, AppStatusModel>();

    // Creating Instance of Database Manager
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Parser
    PlayStoreParser dataParser = new PlayStoreParser();

    // Skips the app and review phases (left in place on purpose; remove to run every phase)
    goto PeopleData;

    using (WebRequests httpClient = new WebRequests())
    {
        // Minor Configuration of the Http Client - Ensures that the requests response will be in english
        // By doing so, we have no problems parsing the dates to their proper formats
        httpClient.Headers.Add(Consts.ACCEPT_LANGUAGE);
        httpClient.Host = Consts.HOST;
        httpClient.Encoding = "utf-8";
        httpClient.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

        // Iterating over File Lines (App Urls) - To Extract Data, Not The Reviews Yet.
        foreach (string appUrl in File.ReadAllLines(inputFile))
        {
            // Logging Progress
            Console.WriteLine("\n => Processing App : " + appUrl);

            // Executing Http Get Request for the App's Data - retried up to _maxRetries times
            String appDataResponse = String.Empty;
            int currentRetry = 0;

            do
            {
                // Http Get
                appDataResponse = httpClient.Get(appUrl);

                // Retry ONLY while the response is blank AND there are retries left.
                // (With "||" a blank response looped forever and a good one was re-downloaded)
            } while (String.IsNullOrWhiteSpace(appDataResponse) && ++currentRetry <= _maxRetries);

            // Sanity Check
            if (String.IsNullOrWhiteSpace(appDataResponse))
            {
                Console.WriteLine("\t\t.Error - Failed to find page of app : " + appUrl + ". Skipping it");
                continue;
            }

            Console.WriteLine("\t\t.Page Found. Firing Parser");

            // Parsing App Data
            AppModel appData = dataParser.ParseAppPage(appDataResponse, appUrl);

            // Checking If this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                Console.WriteLine("\t\t.Previous Version of App Found. Updating It");
                mongoDB.UpdateRecord(appData, "Url", appData.Url);

                // Updating App Status
                _appStatus.Add
                    (
                        appData.Url,
                        new AppStatusModel()
                        {
                            appId = appData.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty),
                            appUrl = appData.Url,
                            appName = appData.Name,
                            status = "Updated"
                        }
                    );
            }
            else
            {
                Console.WriteLine("\t\t.No Previous Version of the App Found. Adding to Database");
                mongoDB.Insert<AppModel>(appData);

                // Updating App Status
                _appStatus.Add
                    (
                        appData.Url,
                        new AppStatusModel()
                        {
                            appId = appData.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty),
                            appUrl = appData.Url,
                            appName = appData.Name,
                            status = "Inserted"
                        }
                    );
            }
        }
    }

    Reviews:
    // Next Phase: Parse Reviews of those Apps
    Console.WriteLine("\n => Parsing Complete. Obtaining Reviews");

    // Iterating again over app urls to parse the reviews from this app
    foreach (string appUrl in File.ReadAllLines(inputFile))
    {
        // Reaching App Id
        string appID = _appStatus[appUrl].appId;

        // Reviews-Break-Parsing Flag
        bool shouldContinueParsing = true;

        // Parsing Review Pages from the apps
        for (int currentPage = 1; /* no stop condition */; currentPage++)
        {
            // Getting Reviews Data Bundle
            string reviewsData = ReviewsWrapper.GetAppReviews(appID, currentPage);

            // Checking for Blocking Situation
            if (String.IsNullOrEmpty(reviewsData))
            {
                Console.WriteLine("Blocked by Play Store. Sleeping process for 10 minutes before retrying.");

                // Thread Wait for 10 Minutes
                Thread.Sleep(10 * 60 * 1000);

                // Retry the SAME page: the loop increment is undone before continuing. Falling
                // through would dereference a null response or misreport it as "No Reviews left"
                currentPage--;
                continue;
            }

            // Checking for "No Reviews" app
            if (reviewsData.Length < 50)
            {
                Console.WriteLine("No Reviews left for this app. Skipping");
                break;
            }

            // Normalizing Response to Proper HTML
            reviewsData = ReviewsWrapper.NormalizeResponse(reviewsData);

            // Iterating over Parsed Reviews
            foreach (var review in dataParser.ParseReviews(reviewsData))
            {
                // Adding App Data to the review
                review.appID = _appStatus[appUrl].appId;
                review.appName = _appStatus[appUrl].appName;
                review.appURL = _appStatus[appUrl].appUrl;

                // Incrementing Reviews Count for this app
                _appStatus[appUrl].reviews++;

                // Adding Review Object to Database
                review.timestamp = DateTime.Now;

                // Building Query to check for duplicated review
                var duplicatedReviewQuery = Query.EQ("permalink", review.permalink);

                // Checking for duplicated review before inserting it
                if (mongoDB.FindMatch<AppReview>(duplicatedReviewQuery, 1, 0, Consts.REVIEWS_COLLECTION).Count() == 0)
                {
                    // Inserting Review into MongoDB
                    mongoDB.Insert<AppReview>(review, Consts.REVIEWS_COLLECTION);
                }
                else
                {
                    Console.WriteLine("Duplicated Review. Skipping App");

                    // When this happens, there are no more reviews to be parsed
                    shouldContinueParsing = false; // Skipping this apps processing
                }
            }

            // Hiccup to avoid Blocking problems
            Console.WriteLine("Parsed Reviews: " + _appStatus[appUrl].reviews);
            Thread.Sleep(new Random().Next(14000, 21000));

            if (!shouldContinueParsing)
            {
                break;
            }
        }
    }

    PeopleData:

    Console.WriteLine("\n\n => Processing People Data");

    Console.WriteLine("\nSimulating Google Login Using Selenium.");
    using (var firefoxDriver = new FirefoxDriver())
    {
        // Navigating to Dummy Url - one that is known to ask for a login
        firefoxDriver.Navigate().GoToUrl("https://play.google.com/store/people/details?id=101242565951396343093");

        // Reaching Login Fields
        var loginField = firefoxDriver.FindElementById("Email");
        var passwordField = firefoxDriver.FindElementById("Passwd");
        var btnSignIn = firefoxDriver.FindElementById("signIn");

        // Sending Credentials to the browser (placeholders - must be replaced before running)
        loginField.SendKeys("YOUREMAIL");
        passwordField.SendKeys("YOURPASSWORD");
        btnSignIn.Click();

        // Url used as a resume point: every reviewer url before it is skipped
        string lastPeople = "https://play.google.com/store/people/details?id=115037241907660526856";
        bool shouldcontinue = false;

        // Processing Reviewers Data
        foreach (string peopleUrl in mongoDB.FindPeopleUrls())
        {
            // Skipping until last link
            if (peopleUrl == lastPeople)
            {
                shouldcontinue = true;
            }

            if (!shouldcontinue)
            {
                continue;
            }

            // Navigating To the Reviewer Page
            firefoxDriver.Navigate().GoToUrl(peopleUrl);

            // Reading the source of the Reviewer page on Google Play
            string reviewerPage = firefoxDriver.PageSource;

            // Extracting Reviewer Data from the Page
            ReviewerPageData reviewerData = dataParser.ParsePeopleData(reviewerPage);

            // Adding Url to the model
            reviewerData.reviewerUrl = peopleUrl;

            // Inserting it to the database - If no previous record of this Reviewer is found
            if (!mongoDB.IsReviewerOnDatabase(peopleUrl))
            {
                mongoDB.Insert<ReviewerPageData>(reviewerData, "ReviewersData");
            }
        }
    }

    // End of Processing + Console Feedback
    Console.WriteLine("\n\n == Processing Summary ==");

    foreach (var status in _appStatus.Select(t => t.Value))
    {
        // Message
        string cMessage = "=> App : {0} - Status {1} - Reviews : {2}";

        Console.WriteLine(String.Format(cMessage, status.appName, status.status, status.reviews));
    }

    Console.ReadLine();
}
/// <summary>
/// Runs a Play Store search for the received term and walks through the result pages,
/// pushing every app url that is neither processed nor queued into the Mongo "QUEUE" collection.
/// Paging ends when no paging token is found, when a url repeats, when a page brings no
/// results or when the amount of http errors goes beyond the allowed maximum.
/// </summary>
/// <param name="searchField">Term to be searched on the store.</param>
/// <param name="shouldUseProxies">True to issue the requests through a proxy from ProxiesLoader.</param>
private static void CrawlStore(string searchField, bool shouldUseProxies)
{
    _logger.Warn("Crawling Search Term : [ " + searchField + " ]");

    // Urls seen so far during this crawl, plus the flag raised once one of them repeats
    HashSet<String> seenUrls = new HashSet<String>();
    bool pagingFinished = false;

    // Expression that extracts the "pagToken" out of the Play Store response
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Last html received from the store
    string html;

    // Database access
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string mongoAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, mongoAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);
    mongoDB.EnsureIndex("Url");

    PlayStoreParser parser = new PlayStoreParser();

    using (WebRequests server = new WebRequests())
    {
        // Request setup
        server.Headers.Add(Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy();
        }

        // The same url is the target of the initial request and of every paged one
        string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

        // First page of results
        html = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        foreach (string url in parser.ParseAppUrls(html))
        {
            seenUrls.Add(url);

            bool alreadyKnown = mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url) || mongoDB.AppQueued(url);
            if (!alreadyKnown)
            {
                mongoDB.AddToQueue(url);
                Thread.Sleep(250); // Hiccup
            }
        }

        // Remaining pages, each one reached through the token found on the previous response
        int errorsCount = 0;

        do
        {
            Match tokenMatch = pagTokenRegex.Match(html);

            // Without a token there is no next page to ask for
            if (!tokenMatch.Success)
            {
                break;
            }

            // Token clean up before it goes into the post data
            string pagToken = tokenMatch.Value;
            pagToken = pagToken.Replace(":S:", "%3AS%3A");
            pagToken = pagToken.Replace("\\42", String.Empty);
            pagToken = pagToken.Replace(@"\\u003d", String.Empty);

            string postData = String.Format(Consts.POST_DATA, pagToken);

            html = server.Post(crawlUrl, postData);

            if (server.StatusCode == System.Net.HttpStatusCode.OK)
            {
                foreach (string url in parser.ParseAppUrls(html))
                {
                    // Add returns false for a url that was seen before: paging is looping
                    if (!seenUrls.Add(url))
                    {
                        pagingFinished = true;
                        break;
                    }

                    bool alreadyKnown = mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url) || mongoDB.AppQueued(url);
                    if (!alreadyKnown)
                    {
                        mongoDB.AddToQueue(url);
                        Thread.Sleep(250); // Hiccup
                    }
                }
            }
            else
            {
                // Failed page: count the error and let the loop condition decide what comes next
                _logger.Error("Http Error - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;
            }

        } while (!pagingFinished && parser.AnyResultFound(html) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Endless consumer loop: dequeues app messages from the SQS queue, deserializes each one into
/// an AppleStoreAppModel and inserts it on MongoDB when it is not there yet. Every dequeued
/// message is deleted from the queue afterwards, whether or not it could be recorded.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    // Creating needed Instances
    _logger = new LogWrapper();

    // Loading Configuration
    _logger.LogMessage("Loading Configurations from App.config");
    LoadConfiguration();

    // Initializing Queue
    _logger.LogMessage("Initializing Queue");
    AWSSQSHelper appsDataQueue = new AWSSQSHelper(_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Creating MongoDB Instance
    _logger.LogMessage("Loading MongoDB / Creating Instances");

    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string serverAddr = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAddr, 10000, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initializing Control Variables
    int fallbackWaitTime = 1;

    _logger.LogMessage("Started Processing App Urls");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsDataQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsDataQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time (milliseconds)
                int waitTime;
                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try - waitTime is in milliseconds, so it is converted
                // to make the printed value agree with the "(seconds)" label
                Console.WriteLine("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appDataMessage in appsDataQueue.GetDequeuedMessages())
            {
                try
                {
                    // Deserializing message
                    var appData = AppleStoreAppModel.FromJson(appDataMessage.Body);

                    // Checking for duplicates
                    if (!mongoDB.IsAppOnDatabase<AppleStoreAppModel>(appData.url))
                    {
                        // Recording App Data
                        mongoDB.Insert<AppleStoreAppModel>(appData);
                    }
                }
                catch (Exception ex)
                {
                    _logger.LogMessage(ex.Message, "App Recording", BDC.BDCCommons.TLogEventLevel.Error);
                }
                finally
                {
                    // Deleting the message (also done when recording failed)
                    appsDataQueue.DeleteMessage(appDataMessage);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.LogMessage(ex);
        }

    } while (true);
}
/// <summary>
/// Reads from MongoDB every app whose "Uploaded" attribute is not true and uploads them to
/// Keen.IO in batches of 1000. When the amount sent differs from the amount processed
/// (some batch failed), a second pass retries the remaining apps one by one.
/// </summary>
/// <param name="args">Optional. When args[0] is "reset" the update flag of the matching apps is set to false first.</param>
static void Main(string[] args)
{
    // Loading Keen.IO Keys and Misc. from Config File
    _keenIOProjectID = ConfigurationManager.AppSettings["keenIOProjectID"];
    _keenIOMasterKey = ConfigurationManager.AppSettings["keenIOMasterKey"];
    _keenIOWriteKey = ConfigurationManager.AppSettings["keenIOWriteKey"];
    _keenIOReadKey = ConfigurationManager.AppSettings["keenIOReadKey"];
    _bucketName = ConfigurationManager.AppSettings["keenIOBucketName"];

    // Configuring MongoDB Wrapper for connection and queries
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Keen.IO Variables
    var projectSettings = new ProjectSettingsProvider(_keenIOProjectID, _keenIOMasterKey, _keenIOWriteKey, _keenIOReadKey);
    var keenClient = new KeenClient(projectSettings);

    DateTime start = DateTime.Now;

    // From This point on, you can change your code to reflect your own "Reading" logic
    // What I've done is simply read the records from the MongoDB database and Upload them to Keen.IO

    if (args.Length != 0 && args[0] == "reset")
    {
        // NOTE(review): this resets apps that are already "not uploaded"; confirm whether the
        // query was meant to select the uploaded ones instead
        int count = 0;
        foreach (var currentApp in mongoDB.FindMatch<AppModel>(Query.NE("Uploaded", true)))
        {
            mongoDB.SetUpdated(currentApp.Url, false);
            ++count;

            if ((count % 100) == 0)
            {
                Console.WriteLine("Reset update for {0}", count);
            }
        }
    }

    // First pass: large batches
    long totalSent;
    long totalProcessed = UploadPendingApps(keenClient, mongoDB, 1000, start, out totalSent);

    // Second pass, only when something failed: one app per request, so that a single bad
    // record cannot take a whole batch down with it
    if (totalProcessed != totalSent)
    {
        UploadPendingApps(keenClient, mongoDB, 1, start, out totalSent);
    }
}

/// <summary>
/// Uploads every app still flagged as not uploaded, batchSize apps per request, printing the throughput after each batch.
/// </summary>
/// <param name="keenClient">Keen.IO client used for the upload.</param>
/// <param name="mongoDB">Wrapper used to read the pending apps and to flag the uploaded ones.</param>
/// <param name="batchSize">Amount of apps sent per request.</param>
/// <param name="start">Moment the whole process started; used for the throughput feedback.</param>
/// <param name="totalSent">Amount of apps reported as sent by SendEventsToKeep.</param>
/// <returns>Amount of apps that went through a send attempt.</returns>
private static long UploadPendingApps(KeenClient keenClient, MongoDBWrapper mongoDB, int batchSize, DateTime start, out long totalSent)
{
    var batch = new List<AppModel>();
    long processed = 0;
    long sent = 0;

    foreach (var currentApp in mongoDB.FindMatch<AppModel>(Query.NE("Uploaded", true)))
    {
        // The app is added BEFORE the size check, so the one that fills the batch is part of
        // it (it used to be dropped whenever a flush was triggered)
        batch.Add(currentApp);

        if (batch.Count < batchSize)
        {
            continue;
        }

        sent += SendEventsToKeep(keenClient, batch, mongoDB);
        processed += batch.Count;
        Console.WriteLine("processed {0} events took {1}: ({2} events per sec)", processed, DateTime.Now - start, ((double)processed) / (DateTime.Now - start).TotalSeconds);
        batch.Clear();
    }

    // Flushing the last, partially filled batch - its result counts as "sent" too, otherwise
    // the caller would always see a mismatch and always run the retry pass
    if (batch.Count > 0)
    {
        sent += SendEventsToKeep(keenClient, batch, mongoDB);
        processed += batch.Count;
        Console.WriteLine("processed {0} events took {1}: ({2} events per sec)", processed, DateTime.Now - start, ((double)processed) / (DateTime.Now - start).TotalSeconds);
        batch.Clear();
    }

    totalSent = sent;
    return processed;
}
/// <summary>
/// Entry point: downloads review pages for the apps returned by a fixed MongoDB query
/// (category EDUCATION) and inserts every review whose permalink is not stored yet.
/// </summary>
/// <param name="args">
/// Exactly three integers: number of apps to process, number of review pages per app,
/// and number of apps to skip.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    Logger logger = LogManager.GetCurrentClassLogger();

    // Parsing Arguments
    logger.Info("Checking for Arguments");

    if (args == null || args.Length != 3)
    {
        // Single message: the (string, string) overload would treat the second string as a format argument
        logger.Fatal("Arguments Fatal: Incorrect number of arguments received. Try passing three.");
        return; // Halts.
    }

    logger.Info("Reading Arguments");

    // Reading actual arguments received
    _arguments.Add("AppsToProcess", Int32.Parse(args[0]));
    _arguments.Add("ReviewsPagePerApp", Int32.Parse(args[1]));
    _arguments.Add("AppsToSkip", Int32.Parse(args[2]));

    // Building MongoDB Query - This query specifies which applications you want to parse out the reviews
    // For more regarding MongoDB Queries, check the documentation on the project wiki page
    var mongoQuery = Query.EQ("Category", "/store/apps/category/EDUCATION");

    logger.Info("Configuring MonboDB Client");

    // Creating instance of Mongo Handler for the main collection
    MongoDBWrapper mongoClient = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoClient.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    logger.Info("Iterating over Apps");

    // Creating Play Store Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Iterating over Query Results for the App Ids
    foreach (var appRecord in mongoClient.FindMatch<AppModel>(mongoQuery, _arguments["AppsToProcess"], _arguments["AppsToSkip"]))
    {
        // Extracting app ID from URL
        string appId = appRecord.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty);

        // Console Feedback
        logger.Info("Processing App [ " + appRecord.Name + " ] ");

        // Iterating over Review Pages up to the max received as argument
        for (int currentPage = 1; currentPage <= _arguments["ReviewsPagePerApp"]; currentPage++)
        {
            try
            {
                // Page Feedback
                logger.Info("\tCurrent Page: " + currentPage);

                // Issuing Request for Reviews
                string response = ReviewsWrapper.GetAppReviews(appId, currentPage);

                // Checking for Blocking Situation
                if (String.IsNullOrEmpty(response))
                {
                    logger.Info("Blocked by Play Store. Sleeping process for 10 minutes before retrying.");

                    // Thread Wait for 10 Minutes
                    Thread.Sleep(10 * 60 * 1000);

                    // Actually retry the same page once; previously the empty response was used as-is
                    response = ReviewsWrapper.GetAppReviews(appId, currentPage);

                    if (String.IsNullOrEmpty(response))
                    {
                        logger.Info("Still blocked by Play Store after retrying. Skipping remaining pages of this app.");
                        break;
                    }
                }

                // Checking for "No Reviews" app
                if (response.Length < 50)
                {
                    logger.Info("No Reviews for this app. Skipping");
                    break;
                }

                // Normalizing Response to Proper HTML
                response = ReviewsWrapper.NormalizeResponse(response);

                // Iterating over Parsed Reviews
                foreach (var review in parser.ParseReviews(response))
                {
                    // Adding App Data to the review
                    review.appID = appId;
                    review.appName = appRecord.Name;
                    review.appURL = appRecord.Url;

                    // Adding processing timestamp to the model
                    review.timestamp = DateTime.Now;

                    // Building Query to check for duplicated review
                    var duplicatedReviewQuery = Query.EQ("permalink", review.permalink);

                    // Checking for duplicated review before inserting it
                    if (!mongoClient.FindMatch<AppReview>(duplicatedReviewQuery, 1, 0, Consts.REVIEWS_COLLECTION).Any())
                    {
                        // Inserting Review into MongoDB
                        mongoClient.Insert<AppReview>(review, Consts.REVIEWS_COLLECTION);
                    }
                    else
                    {
                        // Duplicates are only logged; the remaining reviews and pages are still processed
                        logger.Info("Duplicated Review: Review already parsed. Skipping it");
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point: endlessly dequeues app data messages from the SQS queue, deserializes them
/// into <c>AppleStoreAppModel</c> records and batch-inserts them into MongoDB.
/// </summary>
/// <remarks>
/// When no message is received, the wait between polls doubles on every attempt and is reset
/// after 12 fallbacks. The loop never terminates on its own.
/// </remarks>
/// <param name="args">Command line arguments (not read by this method).</param>
static void Main (string[] args)
{
    // Loading Configuration
    LogSetup.InitializeLog ("Apple_Store_Recorder.log", "info");
    _logger = LogManager.GetCurrentClassLogger ();

    // Loading Config
    _logger.Info ("Loading Configurations from App.config");
    LoadConfiguration ();

    // Initializing Queue
    _logger.Info ("Initializing Queue");
    AWSSQSHelper appsDataQueue = new AWSSQSHelper (_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Creating MongoDB Instance
    _logger.Info ("Loading MongoDB / Creating Instances");

    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string serverAddr = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAddr, 10000, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Initialiazing Control Variables
    int fallbackWaitTime = 1;

    // Buffer of Messages to be recorder
    List<AppleStoreAppModel> recordsBuffer = new List<AppleStoreAppModel> ();
    List<Message> messagesBuffer = new List<Message> ();

    // Insert Batch Size
    int batchSize = 1000;

    _logger.Info ("Started Recording App Data");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsDataQueue.DeQueueMessages ())
            {
                Thread.Sleep (_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsDataQueue.AnyMessageReceived ())
            {
                // If no message was found, increases the wait time (in milliseconds)
                int waitTime;
                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32 (Math.Pow (2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // Sleeping before next try; waitTime is in milliseconds, so convert for the "seconds" label
                Console.WriteLine ("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep (waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appDataMessage in appsDataQueue.GetDequeuedMessages ())
            {
                try
                {
                    // Deserializing message
                    var appData = AppleStoreAppModel.FromJson (appDataMessage.Body);

                    // Dumping "Url" to "_id"
                    appData._id = appData.url;

                    // Adding it to the buffer of records to be recorded
                    recordsBuffer.Add (appData);

                    // Adding message to the buffer of messages to be deleted
                    messagesBuffer.Add (appDataMessage);

                    // Is it time to batch insert ?
                    if ((recordsBuffer.Count % batchSize) == 0)
                    {
                        // Batch Insertion
                        mongoDB.BatchInsert<AppleStoreAppModel> (recordsBuffer);

                        // Logging Feedback
                        _logger.Info ("\tApps Recorded : " + recordsBuffer.Count);

                        // Deleting Messages
                        messagesBuffer.ForEach ( (msg) => appsDataQueue.DeleteMessage (msg));
                        _logger.Info ("\tMessages Deleted: " + messagesBuffer.Count);

                        // Clearing Buffers
                        recordsBuffer.Clear ();
                        messagesBuffer.Clear ();
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error (ex);
                }
                finally
                {
                    // Deleting the message
                    // NOTE(review): this runs for every message, including ones that were only buffered,
                    // so the batch deletion above calls DeleteMessage a second time on the same messages.
                    // Presumably a buffered record can be lost if the process stops before BatchInsert -
                    // confirm the intended delivery semantics before changing this.
                    appsDataQueue.DeleteMessage (appDataMessage);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error (ex);
        }
    } while (true);
}
/// <summary>
/// Releases this object's hold on the MongoDB wrapper by clearing the field that references it.
/// </summary>
public void Dispose()
{
    // Only the reference is dropped; no method is invoked on the wrapper itself
    _mongoDB = null;
}
/// *** READ THIS BEFORE YOU START. ***
/// *** I MEAN IT, PLEASE, READ IT  ***
///
/// This exporting helper will download ALL THE APPS found on the database, and
/// dump it to a CSV file (with headers).
///
/// Note that, since the database is Hosted on AWS, i will PAY (for the internet outbound traffic) if you execute a full database export,
/// so, if you are going to execute a full export, please, get in touch with me before running this project, or send me a donation
/// via paypal on [email protected]
///
/// Also, be nice with the database.
///
/// ** END OF WARNING ***
///
/// <summary>
/// Entry point: appends a CSV header line followed by one line per app (as produced by
/// <c>AppModel.ToString()</c>) to the file given as the first argument.
/// </summary>
/// <param name="args">Command line arguments; <c>args[0]</c> is the output file path.</param>
static void Main(string[] args)
{
    // Logs Counter
    int processedApps = 0;

    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;

    // Overriding LogWriter Event
    LogWriter.LogEvent += LogWriter_LogEvent;

    LogWriter.Info("Checking Arguments");

    // Periodic Log Timer (first tick after 10s, then every 10s).
    // Held in a using block so it stays referenced for the whole export and is disposed afterwards.
    using (Timer loggingThread = new Timer((state) =>
    {
        LogWriter.Info ("Processed Apps: " + processedApps);
    }, null, 10000, 10000))
    {
        // Validating Arguments
        if (!ValidateArgs (args))
        {
            LogWriter.Fatal ("Invalid Args", "Args must have 1 element");
            return;
        }

        LogWriter.Info("Checking Write Permissions on output Path");

        // Validating Write Permissions on output path
        if (!ValidateFilePermissions (args[0]))
        {
            LogWriter.Fatal("Insuficient Permissions", "Cannot write on path : " + args[0]);
            return;
        }

        // Configuring MongoDB Wrapper
        MongoDBWrapper mongoDB = new MongoDBWrapper();
        string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
        mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

        // Opening Output Stream (append mode)
        using (StreamWriter sWriter = new StreamWriter (args[0], true, Encoding.GetEncoding("ISO-8859-1")))
        {
            // Auto Flush Content
            sWriter.AutoFlush = true;

            // Writing Headers - note the comma after LastUpdateDate: without it that column name
            // fuses with AppSize ("LastUpdateDateAppSize")
            String headersLine = "_id,Url,Name,Developer,IsTopDeveloper,DeveloperURL,PublicationDate,"
                               + "Category,IsFree,Price,Reviewers,CoverImgUrl,Description,Score.Total,Score.Count,Score.FiveStars,"
                               + "Score.FourStars,Score.ThreeStars,Score.TwoStars,Score.OneStars,LastUpdateDate,"
                               + "AppSize,Instalations,CurrentVersion,MinimumOSVersion,ContentRating,HaveInAppPurchases,DeveloperEmail,DeveloperWebsite,DeveloperPrivacyPolicy";

            sWriter.WriteLine (headersLine);

            // Reading all apps from the database
            foreach (AppModel app in mongoDB.FindAll<AppModel>())
            {
                try
                {
                    // Writing line to File
                    sWriter.WriteLine (app.ToString ());
                    processedApps++;
                }
                catch (Exception ex)
                {
                    // A failing record is logged and skipped; the export continues
                    LogWriter.Error (ex);
                }
            }
        }
    }

    // Logging end of the Process
    LogWriter.Info ("Finished Exporting Database");

    // Removing Event
    LogWriter.LogEvent -= LogWriter_LogEvent;
}
/// <summary>
/// Handles a confirmation request: reads the query string parameters, loads the patient
/// referenced by the local NoID from the Spark FHIR endpoint and, when the challenge is
/// accepted, adds the patient to the pending queue.
/// </summary>
/// <remarks>
/// Writes "yes" or "no" for the "birthdate" and "failedchallenge" fields, an error text for
/// the fields that are not implemented, and a "no. Error occured ..." text when an exception
/// is caught. Nothing is written when the patient is null or the field name is not recognized.
/// </remarks>
/// <param name="context">Current HTTP context; the response is written as text/plain and then ended.</param>
public void ProcessRequest(HttpContext context)
{
    context.Response.ContentType = "text/plain";
    try
    {
        foreach (String key in context.Request.QueryString.AllKeys)
        {
            // A query string item without a name (e.g. "?flag") yields a null key
            if (key == null)
            {
                continue;
            }

            // Invariant lower-casing keeps key matching independent of the server culture
            switch (key.ToLowerInvariant())
            {
                case "localnoid":
                    _localNoID = context.Request.QueryString[key];
                    break;
                case "fieldname":
                    _confirmFieldName = context.Request.QueryString[key];
                    break;
                case "confirmreponse":
                    _confirmReponse = context.Request.QueryString[key];
                    break;
                case "computername":
                    _computerName = context.Request.QueryString[key];
                    break;
                case "clinicarea":
                    _clinicArea = context.Request.QueryString[key];
                    break;
            }
        }

        MongoDBWrapper dbwrapper = new MongoDBWrapper(NoIDMongoDBAddress, SparkMongoDBAddress);
        FhirClient client = new FhirClient(sparkEndpointAddress);
        string sparkReference = dbwrapper.GetSparkID(_localNoID);
        string sparkAddress = sparkEndpointAddress.ToString() + "/Patient/" + sparkReference;
        Patient pendingPatient = (Patient)client.Get(sparkAddress);

        if (pendingPatient != null)
        {
            if (_confirmFieldName == "birthdate")
            {
                if (pendingPatient.BirthDate != null && _confirmReponse == pendingPatient.BirthDate)
                {
                    QueuePendingPatient(dbwrapper, pendingPatient, sparkReference, "return");
                    context.Response.Write("yes");
                }
                else
                {
                    context.Response.Write("no");
                }
            }
            else if (_confirmFieldName == "lastname")
            {
                //TODO: implement lastname, use metaphone or just accept exact matches?
                context.Response.Write("Error occurred. " + _confirmFieldName + " is not implemented yet!");
            }
            else if (_confirmFieldName == "firstname")
            {
                //TODO: implement firstname, use root or just accept exact matches?
                context.Response.Write("Error occurred. " + _confirmFieldName + " is not implemented yet!");
            }
            else if (_confirmFieldName == "failedchallenge")
            {
                QueuePendingPatient(dbwrapper, pendingPatient, sparkReference, "return**");
                context.Response.Write("yes");
            }
        }
    }
    catch (Exception ex)
    {
        context.Response.Write("no. Error occured for LocalNoID = " + _localNoID + ". UpdatePendingStatus::ProcessRequest: " + ex.Message);
    }
    context.Response.End();
}

/// <summary>
/// Builds a session queue entry for the patient and adds it to the pending patients.
/// </summary>
/// <param name="dbwrapper">Database wrapper that receives the new entry.</param>
/// <param name="pendingPatient">Patient loaded from the Spark endpoint.</param>
/// <param name="sparkReference">Spark identifier of the patient.</param>
/// <param name="patientLabel">Value passed through as the fourth argument of <c>Utilities.PatientToSessionQueue</c>.</param>
private void QueuePendingPatient(MongoDBWrapper dbwrapper, Patient pendingPatient, string sparkReference, string patientLabel)
{
    SessionQueue seq = Utilities.PatientToSessionQueue(pendingPatient, sparkReference, _localNoID, patientLabel, "pending");
    seq.SubmitDate = DateTime.UtcNow;
    // Fresh id: hash of domain name + new GUID + node salt
    seq._id = StringUtilities.SHA256(DomainName + Guid.NewGuid().ToString() + NodeSalt);
    seq.SessionComputerName = _computerName;
    seq.ClinicArea = _clinicArea;
    dbwrapper.AddPendingPatient(seq);
}
/// <summary>
/// Entry point: repeatedly takes the next app returned by <c>FindAndModifyReviews</c>,
/// downloads its first page of reviews, merges the new reviews into the app record and
/// saves the record with a "Visited" or "Error" review status.
/// </summary>
/// <param name="args">
/// Optional single argument: path of a text file with proxies. When present, proxies are
/// loaded and requests are issued through them.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    Logger logger = LogManager.GetCurrentClassLogger();

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        // Setting flag to true
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // MongoDB instance Creation
    logger.Info("Configuring MonboDB Client");

    // Creating instance of Mongo Handler for the main collection
    MongoDBWrapper mongoClient = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoClient.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    logger.Info("Iterating over Apps");

    // Creating Play Store Parser
    PlayStoreParser parser = new PlayStoreParser();

    // App Model
    AppModel appRecord;

    // Finding all the "Apps" that didn't have the reviews visited yet
    while ((appRecord = mongoClient.FindAndModifyReviews()) != null)
    {
        // Per-app flag: must start as true for every app, otherwise one failure
        // would mark every following app as "Error"
        bool noError = true;

        // Extracting app ID from URL
        string appId = appRecord.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty);

        // Console Feedback
        logger.Info("Processing App [ " + appRecord.Name + " ] ");

        try
        {
            // Console Feedback
            Console.Write("Reviews from : " + appRecord.Name);

            // Issuing Request for Reviews
            string response = ReviewsWrapper.GetAppReviews(appId, 1, isUsingProxies);

            // Checking for Blocking Situation
            if (String.IsNullOrEmpty(response))
            {
                logger.Info("Blocked by Play Store. Sleeping process for 10 seconds before retrying.");

                // Thread Wait for 10 seconds
                Thread.Sleep(TimeSpan.FromSeconds(10));

                // Actually retry once; previously the empty response was used as-is
                response = ReviewsWrapper.GetAppReviews(appId, 1, isUsingProxies);

                if (String.IsNullOrEmpty(response))
                {
                    logger.Info("Still blocked by Play Store after retrying. Marking app with error status.");
                    Console.Write(" - Blocked\n");
                    noError = false;
                    continue; // the finally block below still saves the status
                }
            }

            // Checking for "No Reviews" app
            if (response.Length < 50)
            {
                logger.Info("No Reviews for this app. Skipping");
                Console.Write(" - No Reviews Found\n");
                continue;
            }

            // Normalizing Response to Proper HTML
            response = ReviewsWrapper.NormalizeResponse(response);

            // List of Reviews
            List<AppReview> reviews = new List<AppReview>();

            // Iterating over Parsed Reviews
            foreach (var review in parser.ParseReviews(response))
            {
                // Adding App Data to the review
                review.appID = appId;
                review.appName = appRecord.Name;
                review.appURL = appRecord.Url;

                // Capture Timestamp to the model
                review.timestamp = DateTime.Now;

                // Adding reviews to the current list
                reviews.Add(review);
            }

            // Any Review Found ?
            if (reviews.Count > 0)
            {
                Console.Write(" - " + reviews.Count + " Reviews Found\n");

                // Checking if there was any previous list of reviews
                if (appRecord.Reviews == null)
                {
                    appRecord.Reviews = reviews;
                }
                else // Previous List found - Appending only the new ones
                {
                    // Set of permalinks already stored, so each membership test is O(1)
                    HashSet<string> knownPermalinks = new HashSet<string>(appRecord.Reviews.Select(t => t.permalink));

                    foreach (var review in reviews)
                    {
                        if (knownPermalinks.Add(review.permalink))
                        {
                            appRecord.Reviews.Add(review);
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Error : " + ex.Message);
            Console.ForegroundColor = ConsoleColor.White;
            noError = false;
        }
        finally
        {
            // Toggling back the "ReviewsStatus" attribute from the model
            appRecord.ReviewsStatus = noError ? "Visited" : "Error";
            mongoClient.SaveRecord<AppModel>(appRecord);
        }
    }
}
/// <summary>
/// Uploads a batch of apps to the Keen.IO "PlayStore2014" collection and then calls
/// <c>SetUpdated</c> on each uploaded app's url.
/// </summary>
/// <param name="keenClient">Keen.IO client used for the upload.</param>
/// <param name="eventsToSend">Batch of apps to upload.</param>
/// <param name="mongoDB">Database wrapper used to flag each app after the upload.</param>
/// <returns>The batch size when every step succeeded; 0 when any step threw.</returns>
private static int SendEventsToKeep(Keen.Core.KeenClient keenClient, List<AppModel> eventsToSend, MongoDBWrapper mongoDB)
{
    int delivered = 0;

    try
    {
        // Upload first, so the running counter only reflects batches the client accepted
        keenClient.AddEvents("PlayStore2014", eventsToSend);
        _appsCounter += eventsToSend.Count;

        // Progress line whenever the running total is a multiple of 100
        bool atReportingStep = (_appsCounter % 100) == 0;
        if (atReportingStep)
        {
            Console.WriteLine("Uploaded : " + _appsCounter);
        }

        // Flag every app of the batch in the database
        foreach (var uploadedApp in eventsToSend)
        {
            mongoDB.SetUpdated(uploadedApp.Url);
        }

        delivered = eventsToSend.Count;
    }
    catch (Exception failure)
    {
        // Failure is reported on the console and surfaced to the caller as 0
        Console.WriteLine("\n\t" + failure.Message);
    }

    return delivered;
}
/// <summary>
/// Executes a Search using the searchField as the search parameter,
/// paginates / scrolls the search results to the end adding all the url of apps
/// it finds to a Mongo "QUEUE" collection
/// </summary>
/// <remarks>
/// Paging stops when no paging token is found in the last page, when an already seen url
/// shows up again, when the parser reports no results, or when the number of failed
/// requests exceeds <c>Consts.MAX_REQUEST_ERRORS</c>.
/// </remarks>
/// <param name="searchField">Search term formatted into <c>Consts.CRAWL_URL</c>.</param>
/// <param name="shouldUseProxies">When true, requests go through a proxy from <c>ProxiesLoader</c>.</param>
private static void CrawlStore (string searchField, bool shouldUseProxies)
{
    // Console Feedback
    _logger.Warn ("Crawling Search Term : [ " + searchField + " ]");

    // Hashset of urls used to keep track of what's been parsed already
    HashSet<String> foundUrls = new HashSet<String> ();

    // Control variable to avoid "Loop" on pagging
    bool isDonePagging = false;

    // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
    Regex pagTokenRegex = new Regex (@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // HTML Response (always the last page that was fetched successfully)
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper indexe
    mongoDB.EnsureIndex ("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // Queues an app url unless it was already processed or is already queued
    Action<string> queueIfNew = appUrl =>
    {
        if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + appUrl)) && (!mongoDB.AppQueued (appUrl)))
        {
            // Than, queue it :)
            mongoDB.AddToQueue (appUrl);
            Thread.Sleep (250); // Hiccup
        }
    };

    // Executing Web Requests
    using (WebRequests server = new WebRequests ())
    {
        // Creating Request Object
        server.Headers.Add (Consts.ACCEPT_LANGUAGE);
        server.Host = Consts.HOST;
        server.UserAgent = Consts.GITHUBURL;
        server.Encoding = "utf-8";

        // Checking for the need to use "HTTP Proxies"
        if (shouldUseProxies)
        {
            server.Proxy = ProxiesLoader.GetWebProxy ();
        }

        string crawlUrl = String.Format (Consts.CRAWL_URL, searchField);

        // Executing Initial Request
        response = server.Post (crawlUrl, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        foreach (string url in parser.ParseAppUrls (response))
        {
            foundUrls.Add (url);
            queueIfNew (url);
        }

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken from HTML
            var rgxMatch = pagTokenRegex.Match (response);

            // If there's no match, skips it
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            string pagToken = rgxMatch.Value.Replace (":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format (Consts.POST_DATA, pagToken);

            // Executing request for values
            string pageResponse = server.Post (crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                _logger.Error ("Http Error" + " - Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "response" still holds the last good page, so the next iteration
                // re-derives the same paging token and retries this request
                continue;
            }

            response = pageResponse;

            // Parsing Links
            foreach (string url in parser.ParseAppUrls (response))
            {
                // An url seen before means the results started to repeat
                if (!foundUrls.Add (url))
                {
                    isDonePagging = true;
                    break;
                }

                queueIfNew (url);
            }

        } while (!isDonePagging && parser.AnyResultFound (response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}
/// <summary>
/// Entry point: downloads review pages for the apps returned by a fixed MongoDB query
/// (category EDUCATION) and inserts every review whose permalink is not stored yet.
/// </summary>
/// <param name="args">
/// Exactly three integers: number of apps to process, number of review pages per app,
/// and number of apps to skip.
/// </param>
static void Main (string[] args)
{
    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.LogEvent += LogWriter_LogEvent;

    // Parsing Arguments
    LogWriter.Info ("Checking for Arguments");

    if (args == null || args.Length != 3)
    {
        LogWriter.Fatal ("Arguments Fatal", "Incorrect number of arguments received. Try passing three.");
        return; // Halts.
    }

    LogWriter.Info ("Reading Arguments");

    // Reading actual arguments received
    _arguments.Add ("AppsToProcess", Int32.Parse (args[0]));
    _arguments.Add ("ReviewsPagePerApp", Int32.Parse (args[1]));
    _arguments.Add ("AppsToSkip", Int32.Parse (args[2]));

    // Building MongoDB Query - This query specifies which applications you want to parse out the reviews
    // For more regarding MongoDB Queries, check the documentation on the project wiki page
    var mongoQuery = Query.EQ ("Category", "/store/apps/category/EDUCATION");

    LogWriter.Info ("Configuring MonboDB Client");

    // Creating instance of Mongo Handler for the main collection
    MongoDBWrapper mongoClient = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoClient.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    LogWriter.Info ("Iterating over Apps");

    // App URL Prefix (must be removed in order to obtain the app ID)
    string playStorePrefix = "https://play.google.com/store/apps/details?id=";

    // Creating Play Store Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // Iterating over Query Results for the App Ids
    foreach (var appRecord in mongoClient.FindMatch<AppModel>(mongoQuery, _arguments["AppsToProcess"], _arguments["AppsToSkip"]))
    {
        // Extracting app ID from URL
        string appId = appRecord.Url.Replace(playStorePrefix, String.Empty);

        // Console Feedback
        LogWriter.Info("Processing App [ " + appRecord.Name + " ] ");

        // Iterating over Review Pages up to the max received as argument
        for (int currentPage = 1; currentPage <= _arguments["ReviewsPagePerApp"]; currentPage++)
        {
            try
            {
                // Page Feedback
                LogWriter.Info("\tCurrent Page: " + currentPage);

                // Issuing Request for Reviews
                string response = GetAppReviews(appId, currentPage);

                // Checking for Blocking Situation
                if (String.IsNullOrEmpty(response))
                {
                    LogWriter.Info("Blocked by Play Store. Sleeping process for 10 minutes before retrying.");

                    // Thread Wait for 10 Minutes
                    Thread.Sleep(10 * 60 * 1000);

                    // Actually retry the same page once; previously the empty response was used as-is
                    response = GetAppReviews(appId, currentPage);

                    if (String.IsNullOrEmpty(response))
                    {
                        LogWriter.Info("Still blocked by Play Store after retrying. Skipping remaining pages of this app.");
                        break;
                    }
                }

                // Checking for "No Reviews" app
                if (response.Length < 50)
                {
                    LogWriter.Info("No Reviews for this app. Skipping");
                    break;
                }

                // Normalizing Response to Proper HTML
                response = NormalizeResponse(response);

                // Iterating over Parsed Reviews
                foreach (var review in parser.ParseReviews(response))
                {
                    // Adding App Data to the review
                    review.appID = appId;
                    review.appName = appRecord.Name;
                    review.appURL = appRecord.Url;

                    // Adding processing timestamp to the model
                    review.timestamp = DateTime.Now;

                    // Building Query to check for duplicated review
                    var duplicatedReviewQuery = Query.EQ("permalink", review.permalink);

                    // Checking for duplicated review before inserting it
                    if (!mongoClient.FindMatch<AppReview>(duplicatedReviewQuery, 1, 0, Consts.REVIEWS_COLLECTION).Any())
                    {
                        // Inserting Review into MongoDB
                        mongoClient.Insert<AppReview>(review, Consts.REVIEWS_COLLECTION);
                    }
                    else
                    {
                        // Duplicates are only logged; the remaining reviews and pages are still processed
                        LogWriter.Info("Duplicated Review", "Review already parsed. Skipping App");
                    }
                }
            }
            catch (Exception ex)
            {
                LogWriter.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point: reads every app record matching <c>Uploaded != true</c> from MongoDB
/// and uploads it to Keen.IO through <c>SendEventsToKeep</c>.
/// </summary>
/// <remarks>
/// Records are sent in batches of 1000 first. When the number of records sent differs from
/// the number processed, a second pass re-reads the records still matching the query and
/// sends them one at a time.
/// </remarks>
/// <param name="args">
/// Command line arguments. When the first one is "reset", <c>SetUpdated(url, false)</c> is
/// called on every record returned by the query before uploading.
/// </param>
static void Main(string[] args)
{
    // Loading Keen.IO Keys and Misc. from Config File
    _keenIOProjectID = ConfigurationManager.AppSettings["keenIOProjectID"];
    _keenIOMasterKey = ConfigurationManager.AppSettings["keenIOMasterKey"];
    _keenIOWriteKey = ConfigurationManager.AppSettings["keenIOWriteKey"];
    _keenIOReadKey = ConfigurationManager.AppSettings["keenIOReadKey"];
    _bucketName = ConfigurationManager.AppSettings["keenIOBucketName"];

    // Configuring MongoDB Wrapper for connection and queries
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Keen.IO Variables
    var projectSettings = new ProjectSettingsProvider (_keenIOProjectID, _keenIOMasterKey, _keenIOWriteKey, _keenIOReadKey);
    var keenClient = new KeenClient (projectSettings);
    var eventsToSend = new List<AppModel>();
    long totalProcessed = 0;
    long totalSent = 0;
    DateTime start = DateTime.Now;

    // Optional "reset" mode
    if (args != null && args.Length != 0 && args[0] == "reset")
    {
        int count = 0;
        foreach (var currentApp in mongoDB.FindMatch<AppModel>(Query.NE("Uploaded", true)))
        {
            mongoDB.SetUpdated(currentApp.Url, false);
            ++count;
            if ((count % 100) == 0)
            {
                Console.WriteLine("Reset update for {0}", count);
            }
        }
    }

    // Sends whatever is buffered, updates both counters, reports throughput and empties the buffer.
    // Clearing here (and not only inside the loop) keeps a later pass from re-sending a stale remainder.
    Action flushBatch = () =>
    {
        int sent = SendEventsToKeep(keenClient, eventsToSend, mongoDB);
        totalProcessed += eventsToSend.Count;
        totalSent += sent;

        TimeSpan elapsed = DateTime.Now - start;
        Console.WriteLine("processed {0} events took {1}: ({2} events per sec)", totalProcessed, elapsed, ((double)totalProcessed) / elapsed.TotalSeconds);
        eventsToSend.Clear();
    };

    // First pass uses batches of 1000; the fallback pass sends records one by one
    int[] batchSizes = { 1000, 1 };
    foreach (int batchSize in batchSizes)
    {
        totalProcessed = 0;
        totalSent = 0;

        foreach (var currentApp in mongoDB.FindMatch<AppModel>(Query.NE("Uploaded", true)))
        {
            // Buffer the record BEFORE testing the batch size so the record that fills the batch is not dropped
            eventsToSend.Add(currentApp);
            if (eventsToSend.Count < batchSize)
            {
                continue;
            }

            flushBatch();
        }

        // Remainder smaller than a full batch
        if (eventsToSend.Count > 0)
        {
            flushBatch();
        }

        // Everything processed in this pass was sent: no fallback pass needed
        if (totalProcessed == totalSent)
        {
            break;
        }
    }
}
/// <summary>
/// Posts the initial crawl request and then keeps requesting further result pages,
/// queueing every app url that is neither processed nor queued yet.
/// </summary>
/// <remarks>
/// Paging advances in steps of 48 results. It stops when the parser reports no results for
/// a page or when the number of failed requests exceeds <c>Consts.MAX_REQUEST_ERRORS</c>.
/// NOTE(review): searchField is only printed here; the requests use <c>Consts.CRAWL_URL</c>
/// unformatted - confirm whether the term is meant to be part of the request.
/// </remarks>
/// <param name="searchField">Search term shown in the console feedback.</param>
private static void CrawlStore (string searchField)
{
    // Console Feedback
    Console.WriteLine ("Crawling Search Term : [ " + searchField + " ]");

    // HTML Response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper ();
    string fullServerAddress = String.Join (":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase (Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser ();

    // Queues every url of the page that is neither processed nor queued, with console feedback
    Action<string> queuePageUrls = page =>
    {
        foreach (string url in parser.ParseAppUrls (page))
        {
            if ((!mongoDB.AppProcessed (Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued (url)))
            {
                // Console Feedback
                Console.WriteLine (" . Queued App");

                // Than, queue it :)
                mongoDB.AddToQueue (url);
            }
            else
            {
                // Console Feedback
                Console.WriteLine (" . Duplicated App. Skipped");
            }
        }
    };

    // Executing Web Requests
    using (WebRequests server = new WebRequests ())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // Executing Initial Request
        response = server.Post (Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

        // Parsing Links out of Html Page (Initial Request)
        queuePageUrls (response);

        // Executing Requests for more Play Store Links
        int initialSkip = 48;
        int currentMultiplier = 1;
        int errorsCount = 0;
        bool hasMoreResults = true;

        while (hasMoreResults && errorsCount <= Consts.MAX_REQUEST_ERRORS)
        {
            // Assembling new PostData with paging values
            string postData = String.Format (Consts.POST_DATA, (initialSkip * currentMultiplier));

            // Executing request for values
            response = server.Post (Consts.CRAWL_URL, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error ("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // Retry the same page: the multiplier is not advanced and the error body
                // is not consulted to decide whether more results exist
                continue;
            }

            // Parsing Links
            queuePageUrls (response);

            // Incrementing Paging Multiplier
            currentMultiplier++;

            hasMoreResults = parser.AnyResultFound (response);
        }
    }
}
/// <summary>
/// Entry point. Expects exactly one argument: the path of a text file with one app url per line.
/// The method contains three phases (app pages, app reviews, reviewers data) followed by a
/// console summary.
/// </summary>
/// <param name="args">Command line arguments; <c>args[0]</c> is the input file path.</param>
static void Main(string[] args)
{
    // Checking for Input Parameters
    if (args == null || args.Length != 1)
    {
        Console.WriteLine("Incorrect number of arguments received. Expected One");
        System.Environment.Exit(-100);
    }

    string inputFile = args[0];

    // Checking if the Input file received exists
    if (!File.Exists(inputFile))
    {
        Console.WriteLine(String.Format("Received input file does not exist : {0}", inputFile));
        System.Environment.Exit(-101);
    }

    // App Status
    _appStatus = new Dictionary<String, AppStatusModel>();

    // Creating Instance of Database Manager
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Parser
    PlayStoreParser dataParser = new PlayStoreParser();

    // NOTE(review): this jump bypasses the app and review phases below, so only the
    // people phase runs. It looks like a debugging shortcut - confirm before removing it.
    goto PeopleData;

    using (WebRequests httpClient = new WebRequests())
    {
        // Ensures that the responses will be in english, so dates can be parsed to their proper formats
        httpClient.Headers.Add(Consts.ACCEPT_LANGUAGE);
        httpClient.Host = Consts.HOST;
        httpClient.Encoding = "utf-8";
        httpClient.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

        // Iterating over File Lines (App Urls) - To Extract Data, Not The Reviews Yet.
        foreach (string appUrl in File.ReadAllLines(inputFile))
        {
            Console.WriteLine("\n => Processing App : " + appUrl);

            // Http Get for the app page, repeated only while the response is empty and
            // the retry budget is not exhausted.
            String appDataResponse = String.Empty;
            int currentRetry = 0;
            do
            {
                appDataResponse = httpClient.Get(appUrl);
            } while (String.IsNullOrWhiteSpace(appDataResponse) && ++currentRetry <= _maxRetries);

            // Sanity Check
            if (String.IsNullOrWhiteSpace(appDataResponse))
            {
                Console.WriteLine("\t\t.Error - Failed to find page of app : " + appUrl + ". Skipping it");
                continue;
            }

            Console.WriteLine("\t\t.Page Found. Firing Parser");

            // Parsing App Data
            AppModel appData = dataParser.ParseAppPage(appDataResponse, appUrl);

            // Checking If this app is on the database already
            bool alreadyStored = mongoDB.AppProcessed(appUrl);
            if (alreadyStored)
            {
                Console.WriteLine("\t\t.Previous Version of App Found. Updating It");
                mongoDB.UpdateRecord(appData, "Url", appData.Url);
            }
            else
            {
                Console.WriteLine("\t\t.No Previous Version of the App Found. Adding to Database");
                mongoDB.Insert<AppModel>(appData);
            }

            // Updating App Status. The indexer is used instead of Add so that a url listed
            // twice in the input file overwrites its entry instead of throwing.
            _appStatus[appData.Url] = new AppStatusModel()
            {
                appId = appData.Url.Replace(Consts.PLAY_STORE_PREFIX, String.Empty),
                appUrl = appData.Url,
                appName = appData.Name,
                status = alreadyStored ? "Updated" : "Inserted"
            };
        }
    }

    Reviews:
    // Next Phase: Parse Reviews of those Apps
    Console.WriteLine("\n => Parsing Complete. Obtaining Reviews");

    // Iterating again over app urls to parse the reviews from this app
    foreach (string appUrl in File.ReadAllLines(inputFile))
    {
        // Apps whose page could not be fetched in the previous phase have no status entry
        AppStatusModel appStatus;
        if (!_appStatus.TryGetValue(appUrl, out appStatus))
        {
            Console.WriteLine("\t\t.No parsed data for app : " + appUrl + ". Skipping its reviews");
            continue;
        }

        string appID = appStatus.appId;

        // Reviews-Break-Parsing Flag
        bool shouldContinueParsing = true;

        // Consecutive empty responses received for the current page
        int blockedRetries = 0;

        // Parsing Review Pages from the apps
        for (int currentPage = 1; /* no stop condition */; currentPage++)
        {
            // Getting Reviews Data Bundle
            string reviewsData = ReviewsWrapper.GetAppReviews(appID, currentPage);

            // Checking for Blocking Situation
            if (String.IsNullOrEmpty(reviewsData))
            {
                if (++blockedRetries > _maxRetries)
                {
                    Console.WriteLine("Still blocked after " + _maxRetries + " retries. Skipping app");
                    break;
                }

                Console.WriteLine("Blocked by Play Store. Sleeping process for 10 minutes before retrying.");

                // Thread Wait for 10 Minutes
                Thread.Sleep(10 * 60 * 1000);

                // Cancels the loop increment so that the same page is requested again
                currentPage--;
                continue;
            }

            blockedRetries = 0;

            // Checking for "No Reviews" app
            if (reviewsData.Length < 50)
            {
                Console.WriteLine("No Reviews left for this app. Skipping");
                break;
            }

            // Normalizing Response to Proper HTML
            reviewsData = ReviewsWrapper.NormalizeResponse(reviewsData);

            // Iterating over Parsed Reviews
            foreach (var review in dataParser.ParseReviews(reviewsData))
            {
                // Adding App Data to the review
                review.appID = appStatus.appId;
                review.appName = appStatus.appName;
                review.appURL = appStatus.appUrl;

                // Incrementing Reviews Count for this app
                appStatus.reviews++;

                review.timestamp = DateTime.Now;

                // Building Query to check for duplicated review
                var duplicatedReviewQuery = Query.EQ("permalink", review.permalink);

                // Checking for duplicated review before inserting it
                if (mongoDB.FindMatch<AppReview>(duplicatedReviewQuery, 1, 0, Consts.REVIEWS_COLLECTION).Count() == 0)
                {
                    mongoDB.Insert<AppReview>(review, Consts.REVIEWS_COLLECTION);
                }
                else
                {
                    Console.WriteLine("Duplicated Review. Skipping App");

                    // When this happens, there are no more reviews to be parsed
                    shouldContinueParsing = false;
                }
            }

            // Hiccup to avoid Blocking problems
            Console.WriteLine("Parsed Reviews: " + appStatus.reviews);
            Thread.Sleep(new Random().Next(14000, 21000));

            if (!shouldContinueParsing)
            {
                break;
            }
        }
    }

    PeopleData:
    Console.WriteLine("\n\n => Processing People Data");
    Console.WriteLine("\nSimulating Google Login Using Selenium.");

    using (var firefoxDriver = new FirefoxDriver())
    {
        // Navigating to a url that is expected to ask for a login
        firefoxDriver.Navigate().GoToUrl("https://play.google.com/store/people/details?id=101242565951396343093");

        // Reaching Login Fields
        var loginField = firefoxDriver.FindElementById("Email");
        var passwordField = firefoxDriver.FindElementById("Passwd");
        var btnSignIn = firefoxDriver.FindElementById("signIn");

        // Sending Credentials to the browser (placeholders - replace before running)
        loginField.SendKeys("YOUREMAIL");
        passwordField.SendKeys("YOURPASSWORD");
        btnSignIn.Click();

        // Every url returned before this one is skipped; processing starts at this url
        string lastPeople = "https://play.google.com/store/people/details?id=115037241907660526856";
        bool shouldcontinue = false;

        // Processing Reviewers Data
        foreach (string peopleUrl in mongoDB.FindPeopleUrls())
        {
            // Skipping until last link
            if (peopleUrl == lastPeople)
            {
                shouldcontinue = true;
            }

            if (!shouldcontinue)
            {
                continue;
            }

            // Navigating To the Reviewer Page and reading its source
            firefoxDriver.Navigate().GoToUrl(peopleUrl);
            string reviewerPage = firefoxDriver.PageSource;

            // Extracting Reviewer Data from the Page
            ReviewerPageData reviewerData = dataParser.ParsePeopleData(reviewerPage);

            // Adding Url to the model
            reviewerData.reviewerUrl = peopleUrl;

            // Inserting it to the database - If no previous record of this Reviewer is found
            if (!mongoDB.IsReviewerOnDatabase(peopleUrl))
            {
                mongoDB.Insert<ReviewerPageData>(reviewerData, "ReviewersData");
            }
        }
    }

    // End of Processing + Console Feedback
    Console.WriteLine("\n\n == Processing Summary ==");

    foreach (var status in _appStatus.Select(t => t.Value))
    {
        string cMessage = "=> App : {0} - Status {1} - Reviews : {2}";
        Console.WriteLine(String.Format(cMessage, status.appName, status.status, status.reviews));
    }

    Console.ReadLine();
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Takes queued apps one by one, downloads and parses each app page and stores only the apps
/// with at least 1,000,000 installations, a score of at least 3.5 and a non-empty name.
/// Notice that you can run as many workers as you want to in order to make the crawling faster.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.Info("Worker Started");

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over queued apps until FindAndModify returns null
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL
            string appUrl = Consts.APP_URL_PREFIX + app.Url;

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                Console.WriteLine("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // NOTE(review): despite the message, a flagged app is still downloaded and
            // evaluated below - confirm whether it should really be skipped here.
            if (app.NotMeetCrit)
            {
                Console.WriteLine("App Not meet Criteria, Skipped.");
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
            string response = server.Get(appUrl);

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Info("Error opening app page : " + appUrl);

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Inc. retry counter
                retryCounter++;
                Console.WriteLine("Retrying:" + retryCounter);

                // Checking for maximum retry count
                double waitTime;
                if (retryCounter >= 7)
                {
                    waitTime = TimeSpan.FromMinutes(35).TotalMilliseconds;

                    // Removing App from the queue (the app page may have expired)
                    mongoDB.RemoveFromQueue(app.Url);

                    // Starts a fresh worker process and kills the current one
                    Process.Start("PlayStoreWorker.exe");
                    Process.GetCurrentProcess().Kill();
                }
                else
                {
                    // Calculating next wait time ( 2 ^ retryCounter seconds)
                    waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                Thread.Sleep(Convert.ToInt32(waitTime));
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // The installations text is split on " - "; the first piece, with thousand
                // separators removed, is used as the installation count.
                string[] separators = new string[] { " - " };
                string installationsText = parsedApp.Instalations ?? String.Empty;
                string[] installations = installationsText.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                // Stays at 0 (and therefore fails the criteria) when the count is missing or unreadable
                long install_num = 0;
                if (installations.Length > 0)
                {
                    string lowerBound = installations[0].Replace(",", "");
                    if (!Int64.TryParse(lowerBound, out install_num))
                    {
                        Console.WriteLine("The installations value '{0}' is not recognizable", lowerBound);
                        install_num = 0;
                    }
                }

                bool removed = false;
                bool ProcessingWorked = true;

                // Getting the rating and the name for the current app
                double rating = parsedApp.Score.Total;
                string appName = parsedApp.Name;

                // Skip the app if it has less than 1,000,000 installations,
                // OR a score below 3.5, OR an empty name
                if (install_num < 1000000 || rating < 3.5 || String.IsNullOrEmpty(appName))
                {
                    Console.WriteLine("Cannot add app <" + appName + "> -- NOT MEET CRITERIA");

                    // Removing App from the queue
                    mongoDB.RemoveFromQueue(app.Url);
                    removed = true;
                }

                // Inserting App into MONGO_COLLECTION collection. "removed" is tested first so
                // that an app that failed the criteria is never inserted.
                if (!removed && !mongoDB.Insert<AppModel>(parsedApp))
                {
                    Console.WriteLine("Cannot add app <" + appName + "> -- FAIL TO ADD TO Database");
                    ProcessingWorked = false;
                }

                if (!ProcessingWorked)
                {
                    // Keep the app on the queue and flag it as not busy,
                    // so that other workers can try to process it later
                    mongoDB.ToggleBusyApp(app, false);
                }
                else if (!removed)
                {
                    Console.WriteLine("Inserted App : " + parsedApp.Name);
                    mongoDB.RemoveFromQueue(app.Url);
                }
                else
                {
                    Console.WriteLine("Removed App : " + parsedApp.Name);
                }

                // "Related Apps" / "More From Developer" urls are not queued by this worker.

                // Hiccup (used to minimize blocking issues)
                Thread.Sleep(300);
            }
        }
        catch (Exception ex)
        {
            LogWriter.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // ToggleBusyApp may raise an exception (e.g. lack of internet connection);
                // it is logged here so that it does not escape the loop.
                LogWriter.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Takes queued apps one by one, downloads and parses each app page, stores the parsed app and
/// queues the related apps found on the page.
/// Notice that you can run as many workers as you want to in order to make the crawling faster.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.Info("Worker Started");

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over queued apps until FindAndModify returns null
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL
            string appUrl = Consts.APP_URL_PREFIX + app.Url;

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                Console.WriteLine("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;
            string response = server.Get(appUrl);

            // Flag Indicating Success while processing and parsing this app
            bool ProcessingWorked = true;

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Info("Error opening app page : " + appUrl);
                ProcessingWorked = false;

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Inc. retry counter
                retryCounter++;
                Console.WriteLine("Retrying:" + retryCounter);

                // Checking for maximum retry count
                double waitTime;
                if (retryCounter >= 11)
                {
                    waitTime = TimeSpan.FromMinutes(35).TotalMilliseconds;

                    // Removing App from the queue (the app page may have expired).
                    // Every other queue call in this method passes app.Url (no prefix),
                    // so the same value is used here.
                    mongoDB.RemoveFromQueue(app.Url);
                }
                else
                {
                    // Calculating next wait time ( 2 ^ retryCounter seconds)
                    waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                Thread.Sleep(Convert.ToInt32(waitTime));
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // Inserting App into Mongo DB Database
                if (!mongoDB.Insert<AppModel>(parsedApp))
                {
                    ProcessingWorked = false;
                }

                if (!ProcessingWorked)
                {
                    // Keep the app on the queue and flag it as not busy,
                    // so that other workers can try to process it later
                    mongoDB.ToggleBusyApp(app, false);
                }
                else
                {
                    // Processing worked: the app leaves the queue
                    Console.WriteLine("Inserted App : " + parsedApp.Name);
                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                foreach (string extraAppUrl in parser.ParseExtraApps(response))
                {
                    extraAppsCounter++;

                    // Assembling Full app Url to check with database
                    string fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                Console.WriteLine("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            LogWriter.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // ToggleBusyApp may raise an exception (e.g. lack of internet connection);
                // it is logged here so that it does not escape the loop.
                LogWriter.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point of the worker piece of the process.
/// Takes queued apps one by one, downloads and parses each app page, upserts the parsed app and
/// queues its related apps. Notice that you can run as many workers as you want to in order to
/// make the crawling faster.
/// </summary>
/// <param name="args">
/// Optional. When exactly one argument is received it is used as the path of a text file from
/// which the proxies are loaded, and requests are then issued through those proxies.
/// </param>
static void Main(string[] args)
{
    // Configuring Log Object
    LogSetup.InitializeLog("PlayStoreWorker.log", "info");
    Logger logger = LogManager.GetCurrentClassLogger();
    logger.Info("Worker Started");

    // Control Variable (Bool - Should the process use proxies? )
    bool isUsingProxies = false;

    // Checking for the need to use proxies
    if (args != null && args.Length == 1)
    {
        isUsingProxies = true;

        // Loading proxies from .txt received as argument
        String fPath = args[0];

        // Sanity Check
        if (!File.Exists(fPath))
        {
            logger.Fatal("Couldnt find proxies on path : " + fPath);
            System.Environment.Exit(-100);
        }

        // Reading Proxies from File
        string[] fLines = File.ReadAllLines(fPath, Encoding.GetEncoding("UTF-8"));

        try
        {
            // Actual Load of Proxies
            ProxiesLoader.Load(fLines.ToList());
        }
        catch (Exception ex)
        {
            logger.Fatal(ex);
            System.Environment.Exit(-101);
        }
    }

    // Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Creating Instance of Web Requests Server
    WebRequests server = new WebRequests();

    // Queued App Model
    QueuedApp app;

    // Retry Counter (Used for exponential wait increasing logic)
    int retryCounter = 0;

    // Iterating over queued apps until FindAndModify returns null
    while ((app = mongoDB.FindAndModify()) != null)
    {
        try
        {
            // Building APP URL: queue entries without "http" get the store prefix
            string appUrl = app.Url;
            if (app.Url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
            {
                appUrl = Consts.APP_URL_PREFIX + app.Url;
            }

            // Checking if this app is on the database already
            if (mongoDB.AppProcessed(appUrl))
            {
                logger.Info("Duplicated App, skipped.");

                // Delete it from the queue and continues the loop
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Configuring server and Issuing Request
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.UserAgent = Consts.GITHUBURL;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            // Checking for the need to use "HTTP Proxies"
            if (isUsingProxies)
            {
                server.Proxy = ProxiesLoader.GetWebProxy();
            }

            // Issuing HTTP Request
            string response = server.Get(appUrl);

            // Flag Indicating Success while processing and parsing this app
            bool ProcessingWorked = true;

            // Sanity Check
            if (String.IsNullOrEmpty(response) || server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                logger.Info("Error opening app page : " + appUrl);
                ProcessingWorked = false;

                // The status of the failed request must be read BEFORE the WebRequests object
                // is replaced below; the new instance knows nothing about this request.
                var failedStatusCode = server.StatusCode;

                if (isUsingProxies)
                {
                    ProxiesLoader.IncrementCurrentProxy();
                }

                // Renewing WebRequest Object to get rid of Cookies
                server = new WebRequests();

                // Fallback time (milliseconds)
                double waitTime;

                // With proxies there is no need to wait too much;
                // without them the wait grows with every consecutive failure.
                if (isUsingProxies)
                {
                    // Waits two seconds everytime
                    waitTime = TimeSpan.FromSeconds(2).TotalMilliseconds;
                }
                else
                {
                    // Increments retry counter
                    retryCounter++;

                    // Checking for maximum retry count
                    if (retryCounter >= 8)
                    {
                        waitTime = TimeSpan.FromMinutes(20).TotalMilliseconds;
                    }
                    else
                    {
                        // Calculating next wait time ( 2 ^ retryCounter seconds)
                        waitTime = TimeSpan.FromSeconds(Math.Pow(2, retryCounter)).TotalMilliseconds;
                    }
                }

                // Hiccup to avoid google blocking connections in case of heavy traffic from the same IP
                logger.Info("======================================================");
                logger.Info("\n\tFallback : " + TimeSpan.FromMilliseconds(waitTime).TotalSeconds + " Seconds");

                Thread.Sleep(Convert.ToInt32(waitTime));

                // A status code of "ZERO" is treated as 404 - App must be removed from "Queue"
                if (failedStatusCode == 0)
                {
                    logger.Info("\tApp Not Found (404) - " + app.Url);
                    mongoDB.RemoveFromQueue(app.Url);
                }

                logger.Info("======================================================");
            }
            else
            {
                // Reseting retry counter
                retryCounter = 0;

                // Parsing Useful App Data
                AppModel parsedApp = parser.ParseAppPage(response, appUrl);

                // Normalizing URLs
                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperPrivacyPolicy))
                {
                    parsedApp.DeveloperPrivacyPolicy = parsedApp.DeveloperPrivacyPolicy.Replace("https://www.google.com/url?q=", String.Empty);
                }

                if (!String.IsNullOrWhiteSpace(parsedApp.DeveloperWebsite))
                {
                    parsedApp.DeveloperNormalizedDomain = parser.NormalizeDomainName(parsedApp.DeveloperWebsite);
                }

                List<String> relatedApps = new List<String>();

                // Avoiding Exceptions caused by "No Related Apps" situations - Must be treated differently
                try
                {
                    // Parsing "Related Apps" and "More From Developer" Apps (URLS Only)
                    foreach (string extraAppUrl in parser.ParseExtraApps(response))
                    {
                        relatedApps.Add(Consts.APP_URL_PREFIX + extraAppUrl);
                    }

                    // Adding "Related Apps" to Apps Model
                    parsedApp.RelatedUrls = relatedApps.Distinct().ToArray();
                }
                catch
                {
                    logger.Info("\tNo Related Apps Found. Skipping");
                }

                // Inserting App into Mongo DB Database
                if (!mongoDB.UpsertKeyEq<AppModel>(parsedApp, "Url", appUrl))
                {
                    ProcessingWorked = false;
                }

                if (!ProcessingWorked)
                {
                    // Keep the app on the queue and flag it as not busy,
                    // so that other workers can try to process it later
                    mongoDB.ToggleBusyApp(app, false);
                }
                else
                {
                    // Processing worked: the app leaves the queue
                    Console.ForegroundColor = ConsoleColor.Red;
                    logger.Info("Inserted App : " + parsedApp.Name);
                    Console.ForegroundColor = ConsoleColor.White;

                    mongoDB.RemoveFromQueue(app.Url);
                }

                // Counters for console feedback only
                int extraAppsCounter = 0, newExtraApps = 0;

                // Queueing the related apps collected above
                foreach (string extraAppUrl in relatedApps)
                {
                    extraAppsCounter++;

                    // Assembling Full app Url to check with database
                    string fullExtraAppUrl;
                    if (extraAppUrl.IndexOf("https://play.google.com/", StringComparison.Ordinal) >= 0)
                    {
                        fullExtraAppUrl = extraAppUrl;
                    }
                    else
                    {
                        fullExtraAppUrl = Consts.APP_URL_PREFIX + extraAppUrl;
                    }

                    // Checking if the app was either processed or queued to be processed already
                    if ((!mongoDB.AppProcessed(fullExtraAppUrl)) && (!mongoDB.IsAppOnQueue(extraAppUrl)))
                    {
                        newExtraApps++;

                        // Adds it to the queue of apps to be processed
                        mongoDB.AddToQueue(extraAppUrl);
                    }
                }

                // Console Feedback
                logger.Info("Queued " + newExtraApps + " / " + extraAppsCounter + " related apps");
            }
        }
        catch (Exception ex)
        {
            logger.Error(ex);
        }
        finally
        {
            try
            {
                // Toggles Busy status back to false
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                // ToggleBusyApp may raise an exception (e.g. lack of internet connection);
                // it is logged here so that it does not escape the loop.
                logger.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point of the recorder: endlessly dequeues app messages, deserializes them and
/// batch-inserts the resulting records into MongoDB.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    // Loading Configuration
    LogSetup.InitializeLog("Apple_Store_Recorder.log", "info");
    _logger = LogManager.GetCurrentClassLogger();

    // Loading Config
    _logger.Info("Loading Configurations from App.config");
    LoadConfiguration();

    // Initializing Queue
    _logger.Info("Initializing Queue");
    AWSSQSHelper appsDataQueue = new AWSSQSHelper(_appsDataQueueName, _maxMessagesPerDequeue, _awsKey, _awsKeySecret);
    AWSSQSHelper backup = new AWSSQSHelper("DeadLetter", _maxMessagesPerDequeue, _awsKey, _awsKeySecret);

    // Creating MongoDB Instance
    _logger.Info("Loading MongoDB / Creating Instances");

    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string serverAddr = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, serverAddr, 10000, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Setting Error Flag to No Error ( 0 )
    System.Environment.ExitCode = 0;

    // Exponent of the next idle wait (2 ^ fallbackWaitTime seconds)
    int fallbackWaitTime = 1;

    // Buffers of records to be recorded and of their source messages
    List<AppleStoreAppModel> recordsBuffer = new List<AppleStoreAppModel>();
    List<Message> messagesBuffer = new List<Message>();

    // Insert Batch Size
    int batchSize = 1000;

    _logger.Info("Started Recording App Data");

    do
    {
        try
        {
            // Dequeueing messages from the Queue
            if (!appsDataQueue.DeQueueMessages())
            {
                Thread.Sleep(_hiccupTime); // Hiccup
                continue;
            }

            // Checking for no message received, and false positives situations
            if (!appsDataQueue.AnyMessageReceived())
            {
                // If no message was found, increases the wait time (milliseconds)
                int waitTime;
                if (fallbackWaitTime <= 12)
                {
                    // Exponential increase on the wait time, truncated after 12 retries
                    waitTime = Convert.ToInt32(Math.Pow(2, fallbackWaitTime) * 1000);
                }
                else // Reseting Wait after 12 fallbacks
                {
                    waitTime = 2000;
                    fallbackWaitTime = 0;
                }

                fallbackWaitTime++;

                // The queue is idle: records of an incomplete batch are written now instead of
                // staying in memory until the batch fills up. A failure is only logged so the
                // wait below still happens.
                if (recordsBuffer.Count > 0)
                {
                    try
                    {
                        FlushBuffers(mongoDB, appsDataQueue, recordsBuffer, messagesBuffer);
                    }
                    catch (Exception ex)
                    {
                        _logger.Error(ex);
                    }
                }

                // Sleeping before next try
                Console.WriteLine("Fallback (seconds) => " + (waitTime / 1000));
                Thread.Sleep(waitTime);
                continue;
            }

            // Reseting fallback time
            fallbackWaitTime = 1;

            // Iterating over dequeued Messages
            foreach (var appDataMessage in appsDataQueue.GetDequeuedMessages())
            {
                try
                {
                    // Deserializing message
                    var appData = AppleStoreAppModel.FromJson(appDataMessage.Body);

                    // Dumping "Url" to "_id"
                    appData._id = appData.url;

                    // Adding it to the buffer of records to be recorded
                    recordsBuffer.Add(appData);

                    // Adding message to the buffer of messages to be deleted
                    messagesBuffer.Add(appDataMessage);

                    // Is it time to batch insert ?
                    if ((recordsBuffer.Count % batchSize) == 0)
                    {
                        FlushBuffers(mongoDB, appsDataQueue, recordsBuffer, messagesBuffer);
                    }
                }
                catch (Exception ex)
                {
                    _logger.Error(ex);
                }
                finally
                {
                    // NOTE(review): this runs for every message, not only for failed ones, so each
                    // message is deleted here and its body copied to the "DeadLetter" queue even
                    // when it was buffered successfully - confirm this is intended.
                    appsDataQueue.DeleteMessage(appDataMessage);
                    backup.EnqueueMessage(appDataMessage.Body);
                }
            }
        }
        catch (Exception ex)
        {
            _logger.Error(ex);
        }

    } while (true);
}

/// <summary>
/// Batch-inserts the buffered records, deletes their source messages from the queue and
/// clears both buffers.
/// </summary>
private static void FlushBuffers(MongoDBWrapper mongoDB, AWSSQSHelper queue, List<AppleStoreAppModel> records, List<Message> messages)
{
    // Batch Insertion
    mongoDB.BatchInsert<AppleStoreAppModel>(records);
    _logger.Info("\tApps Recorded : " + records.Count);

    // Deleting Messages
    messages.ForEach((msg) => queue.DeleteMessage(msg));
    _logger.Info("\tMessages Deleted: " + messages.Count);

    // Clearing Buffers
    records.Clear();
    messages.Clear();
}
/// <summary>
/// Worker entry point: drains the queue of apps, storing each parsed app page and queueing
/// the related apps it links to. Several workers may run side by side to crawl faster.
/// </summary>
/// <param name="args">Not used.</param>
static void Main(string[] args)
{
    // Log threshold
    LogWriter.Threshold = TLogEventLevel.Information;
    LogWriter.Info("Worker Started");

    PlayStoreParser pageParser = new PlayStoreParser();

    // Database access
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string mongoAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, mongoAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Http client (replaced after every failed request)
    WebRequests server = new WebRequests();

    // Number of consecutive failed requests; drives the growing wait
    int failures = 0;

    // One iteration per queued app, until FindAndModify returns null
    for (QueuedApp app = mongoDB.FindAndModify(); app != null; app = mongoDB.FindAndModify())
    {
        try
        {
            string appUrl = Consts.APP_URL_PREFIX + app.Url;

            // Already stored apps only need to leave the queue
            if (mongoDB.AppProcessed(appUrl))
            {
                Console.WriteLine("Duplicated App, skipped.");
                mongoDB.RemoveFromQueue(app.Url);
                continue;
            }

            // Request setup
            server.Headers.Add(Consts.ACCEPT_LANGUAGE);
            server.Host = Consts.HOST;
            server.Encoding = "utf-8";
            server.EncodingDetection = WebRequests.CharsetDetection.DefaultCharset;

            string response = server.Get(appUrl);

            bool pageLoaded = !String.IsNullOrEmpty(response) && server.StatusCode == System.Net.HttpStatusCode.OK;

            if (pageLoaded)
            {
                failures = 0;

                AppModel parsedApp = pageParser.ParseAppPage(response, appUrl);

                // A failed insert leaves the app on the queue, flagged as not busy, so that
                // another worker can pick it up later; a successful one removes it.
                bool stored = mongoDB.Insert<AppModel>(parsedApp);
                if (stored)
                {
                    Console.WriteLine("Inserted App : " + parsedApp.Name);
                    mongoDB.RemoveFromQueue(app.Url);
                }
                else
                {
                    mongoDB.ToggleBusyApp(app, false);
                }

                // "Related Apps" and "More From Developer" urls; counters are for feedback only
                int relatedSeen = 0;
                int relatedQueued = 0;

                foreach (string relatedUrl in pageParser.ParseExtraApps(response))
                {
                    relatedSeen++;

                    bool known = mongoDB.AppProcessed(Consts.APP_URL_PREFIX + relatedUrl) || mongoDB.IsAppOnQueue(relatedUrl);
                    if (known)
                    {
                        continue;
                    }

                    relatedQueued++;
                    mongoDB.AddToQueue(relatedUrl);
                }

                Console.WriteLine("Queued " + relatedQueued + " / " + relatedSeen + " related apps");
            }
            else
            {
                LogWriter.Info("Error opening app page : " + appUrl);

                // A new client gets rid of the cookies of the previous one
                server = new WebRequests();

                failures++;
                Console.WriteLine("Retrying:" + failures);

                // From the 11th consecutive failure on: long wait and the app leaves the
                // queue; before that: 2 ^ failures seconds.
                TimeSpan pause;
                if (failures >= 11)
                {
                    pause = TimeSpan.FromMinutes(35);
                    mongoDB.RemoveFromQueue(app.Url);
                }
                else
                {
                    pause = TimeSpan.FromSeconds(Math.Pow(2, failures));
                }

                // Pause to avoid being blocked for heavy traffic from the same IP
                Thread.Sleep(Convert.ToInt32(pause.TotalMilliseconds));
            }
        }
        catch (Exception ex)
        {
            LogWriter.Error(ex);
        }
        finally
        {
            // Busy flag goes back to false; a failure here is logged and must not stop the loop
            try
            {
                mongoDB.ToggleBusyApp(app, false);
            }
            catch (Exception ex)
            {
                LogWriter.Error(ex);
            }
        }
    }
}
/// <summary>
/// Entry point of the exporting helper: queries apps from the MongoDB database and
/// writes them, one per line, to the CSV file given as the first argument (header line first).
/// </summary>
/// <remarks>
/// *** READ THIS BEFORE YOU START. ***
/// *** I MEAN IT, PLEASE, READ IT ***
///
/// This exporting helper can download ALL THE APPS found on the database, and
/// dump them to a CSV file (with headers).
///
/// Note that, since the database is hosted on AWS, I will PAY (for the outbound internet traffic) if you execute a full database export,
/// so, if you are going to execute a full export, please get in touch with me before running this project, or send me a donation
/// via paypal on [email protected]
///
/// Also, be nice with the database.
///
/// ** END OF WARNING ***
/// </remarks>
/// <param name="args">Command-line arguments; <c>args[0]</c> is used as the path of the output CSV file.</param>
static void Main(string[] args)
{
    // Number of apps written so far; read by the periodic logging timer below
    int processedApps = 0;

    // Configuring Log Object Threshold
    LogWriter.Threshold = TLogEventLevel.Information;

    // Overriding LogWriter Event
    LogWriter.LogEvent += LogWriter_LogEvent;

    LogWriter.Info("Checking Arguments");

    // Periodic progress log: first tick after 10 seconds, then every 10 seconds.
    // The timer lives in a using block so it stays referenced for the whole export
    // (an unreferenced timer may be collected) and is disposed on every exit path.
    using (Timer loggingTimer = new Timer(state => { LogWriter.Info("Processed Apps: " + processedApps); }, null, 10000, 10000))
    {
        // Validating Arguments
        if (!ValidateArgs(args))
        {
            LogWriter.Fatal("Invalid Args", "Args must have 1 element");
            return;
        }

        LogWriter.Info("Checking Write Permissions on output Path");

        // Validating Write Permissions on output path
        if (!ValidateFilePermissions(args[0]))
        {
            LogWriter.Fatal("Insuficient Permissions", "Cannot write on path : " + args[0]);
            return;
        }

        // Configuring MongoDB Wrapper
        MongoDBWrapper mongoDB = new MongoDBWrapper();
        string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
        mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

        // Opening Output Stream (append mode, so the header line is written on every run)
        using (StreamWriter sWriter = new StreamWriter(args[0], true, Encoding.GetEncoding("ISO-8859-1")))
        {
            // Auto Flush Content
            sWriter.AutoFlush = true;

            // Writing Headers.
            // Every fragment that is followed by another one must end with a comma,
            // otherwise two column names get fused into a single header cell.
            String headersLine = "_id,Url,ReferenceDate,Name,Developer,IsTopDeveloper,DeveloperURL,PublicationDate,"
                               + "Category,IsFree,Price,Reviewers,CoverImgUrl,Description,Score.Total,Score.Count,Score.FiveStars,"
                               + "Score.FourStars,Score.ThreeStars,Score.TwoStars,Score.OneStars,LastUpdateDate,"
                               + "AppSize,Instalations,CurrentVersion,MinimumOSVersion,ContentRating,HaveInAppPurchases,DeveloperEmail,DeveloperWebsite,DeveloperPrivacyPolicy";

            sWriter.WriteLine(headersLine);

            // Example of MongoDB Query Construction
            // Queries for records which have the attribute "IsTopDeveloper" equal to "false"
            var mongoQuery = Query.EQ("IsTopDeveloper", false);

            // Reading the matching apps from the database
            // USAGE: CHANGE FindMatch to FindAll if you want to export all the records from the database
            // NOTE(review): the arguments 10 and 0 look like a page size and an offset - confirm against MongoDBWrapper.FindMatch
            foreach (AppModel app in mongoDB.FindMatch<AppModel>(mongoQuery, 10, 0))
            {
                try
                {
                    // Writing line to File
                    sWriter.WriteLine(app.ToString());
                    processedApps++;
                }
                catch (Exception ex)
                {
                    // A failure on a single record is logged and the export goes on
                    LogWriter.Error(ex);
                }
            }
        }
    }

    // Logging end of the Process
    LogWriter.Info("Finished Exporting Database");

    // Removing Event
    LogWriter.LogEvent -= LogWriter_LogEvent;
}
/// <summary>
/// Executes a search using <paramref name="searchField"/> as the search term and pages through
/// the results for as long as a paging token can be extracted from the last successful response,
/// adding the url of every app that is neither processed nor queued yet to the queue exposed by
/// <c>MongoDBWrapper.AddToQueue</c>.
/// </summary>
/// <param name="searchField">Search term that is formatted into <c>Consts.CRAWL_URL</c>.</param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Compiling Regular Expression used to parse the "pagToken" out of the Play Store
    Regex pagTokenRegex = new Regex(@"GAEi+.+\:S\:.{11}\\42", RegexOptions.Compiled);

    // Last successfully received HTML response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Ensuring the database has the proper index
    mongoDB.EnsureIndex("Url");

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // The crawl url only depends on the search term, so it is built once
        string crawlUrl = String.Format(Consts.CRAWL_URL, searchField);

        // Executing Initial Request
        response = server.Post(crawlUrl, Consts.INITIAL_POST_DATA);

        // Queueing the apps found on the initial page
        QueueNewApps(mongoDB, parser, response);

        // Executing Requests for more Play Store Links
        int errorsCount = 0;

        do
        {
            // Finding pagToken in the last successful response
            var rgxMatch = pagTokenRegex.Match(response);

            // If there's no match, there is no further page to request
            if (!rgxMatch.Success)
            {
                break;
            }

            // Reading Match from Regex, and applying needed replacements
            string pagToken = rgxMatch.Value.Replace(":S:", "%3AS%3A").Replace("\\42", String.Empty).Replace(@"\\u003d", String.Empty);

            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, pagToken);

            // Executing request for values
            string pageResponse = server.Post(crawlUrl, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // "response" is deliberately left untouched: the retry re-derives the paging token
                // from the last good page instead of from the body of the failed request
                continue;
            }

            // Only a successful page replaces the last known good response
            response = pageResponse;

            // Queueing the apps found on this page
            QueueNewApps(mongoDB, parser, response);

        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}

/// <summary>
/// Queues every app url parsed out of <paramref name="response"/> that is neither processed nor queued yet.
/// </summary>
/// <param name="mongoDB">Configured database wrapper used for the duplicate checks and for queueing.</param>
/// <param name="parser">Parser used to extract the app urls from the response.</param>
/// <param name="response">HTML response of a search request.</param>
private static void QueueNewApps(MongoDBWrapper mongoDB, PlayStoreParser parser, string response)
{
    foreach (string url in parser.ParseAppUrls(response))
    {
        // Checks whether the app has already been processed or is queued to be processed
        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
        {
            // Console Feedback
            Console.WriteLine(" . Queued App");

            // Then, queue it
            mongoDB.AddToQueue(url);
            Thread.Sleep(250); // Hiccup
        }
        else
        {
            // Console Feedback
            Console.WriteLine(" . Duplicated App. Skipped");
        }
    }
}
/// <summary>
/// Posts the initial crawl request and then pages through the results in steps of 48,
/// adding the url of every app that is neither processed nor queued yet to the queue exposed by
/// <c>MongoDBWrapper.AddToQueue</c>.
/// </summary>
/// <param name="searchField">
/// Search term. NOTE(review): in this method it is only written to the console; the requests are built
/// from <c>Consts.CRAWL_URL</c>, <c>Consts.INITIAL_POST_DATA</c> and <c>Consts.POST_DATA</c> alone - confirm
/// that the term is not supposed to be part of the request.
/// </param>
private static void CrawlStore(string searchField)
{
    // Console Feedback
    Console.WriteLine("Crawling Search Term : [ " + searchField + " ]");

    // Last successfully received HTML response
    string response;

    // Configuring MongoDB Wrapper
    MongoDBWrapper mongoDB = new MongoDBWrapper();
    string fullServerAddress = String.Join(":", Consts.MONGO_SERVER, Consts.MONGO_PORT);
    mongoDB.ConfigureDatabase(Consts.MONGO_USER, Consts.MONGO_PASS, Consts.MONGO_AUTH_DB, fullServerAddress, Consts.MONGO_TIMEOUT, Consts.MONGO_DATABASE, Consts.MONGO_COLLECTION);

    // Response Parser
    PlayStoreParser parser = new PlayStoreParser();

    // Executing Web Requests
    using (WebRequests server = new WebRequests())
    {
        // Creating Request Object
        server.Host = Consts.HOST;

        // Executing Initial Request
        response = server.Post(Consts.CRAWL_URL, Consts.INITIAL_POST_DATA);

        // Queueing the apps found on the initial page
        QueueUnseenApps(mongoDB, parser, response);

        // Executing Requests for more Play Store Links
        int initialSkip = 48;
        int currentMultiplier = 1;
        int errorsCount = 0;

        do
        {
            // Assembling new PostData with paging values
            string postData = String.Format(Consts.POST_DATA, (initialSkip * currentMultiplier));

            // Executing request for values
            string pageResponse = server.Post(Consts.CRAWL_URL, postData);

            // Checking Server Status
            if (server.StatusCode != System.Net.HttpStatusCode.OK)
            {
                LogWriter.Error("Http Error", "Status Code [ " + server.StatusCode + " ]");
                errorsCount++;

                // The multiplier is not advanced, so the same page is requested again, and "response"
                // still holds the last good page, so the loop condition is not decided by the failed body
                continue;
            }

            // Only a successful page replaces the last known good response
            response = pageResponse;

            // Queueing the apps found on this page
            QueueUnseenApps(mongoDB, parser, response);

            // Incrementing Paging Multiplier
            currentMultiplier++;

        } while (parser.AnyResultFound(response) && errorsCount <= Consts.MAX_REQUEST_ERRORS);
    }
}

/// <summary>
/// Queues every app url parsed out of <paramref name="response"/> that is neither processed nor queued yet.
/// </summary>
/// <param name="mongoDB">Configured database wrapper used for the duplicate checks and for queueing.</param>
/// <param name="parser">Parser used to extract the app urls from the response.</param>
/// <param name="response">HTML response of a crawl request.</param>
private static void QueueUnseenApps(MongoDBWrapper mongoDB, PlayStoreParser parser, string response)
{
    foreach (string url in parser.ParseAppUrls(response))
    {
        // Checks whether the app has already been processed or is queued to be processed
        if ((!mongoDB.AppProcessed(Consts.APP_URL_PREFIX + url)) && (!mongoDB.AppQueued(url)))
        {
            // Console Feedback
            Console.WriteLine(" . Queued App");

            // Then, queue it
            mongoDB.AddToQueue(url);
        }
        else
        {
            // Console Feedback
            Console.WriteLine(" . Duplicated App. Skipped");
        }
    }
}