/// <summary>
/// Downloads the PDF at <paramref name="model"/>.Url, extracts its text, records the
/// first text line containing each requested keyword, persists the result as indented
/// JSON in the Data table, and returns the result to the caller.
/// </summary>
/// <param name="model">Carries the PDF URL and the keywords to search for.</param>
/// <returns>200 OK with the <c>ScrapeDocumentResult</c>, or 500 on any failure.</returns>
public async Task<HttpResponseMessage> ScrapeDocument(DocumentModel model)
{
    string tempFile = null;
    try
    {
        var watch = System.Diagnostics.Stopwatch.StartNew();
        var modelToReturn = new ScrapeDocumentResult()
        {
            NoResultsKeys = new List<string>(),
            Results = new List<KeyValuePair<string, string>>()
        };

        tempFile = Path.GetTempFileName();

        // NOTE(review): an HttpClient per request risks socket exhaustion under load;
        // prefer a shared static instance or IHttpClientFactory. At minimum, dispose
        // the client and the response stream (the original leaked both).
        using (var httpClient = new HttpClient())
        using (var data = await httpClient.GetStreamAsync(model.Url))
        using (var fs = File.OpenWrite(tempFile))
        {
            // Fix: copy asynchronously instead of blocking the request thread
            // with the synchronous Stream.CopyTo inside an async method.
            await data.CopyToAsync(fs);
        }

        var text = Helpers.Helpers.ExtractTextFromPdf(tempFile);

        // Populates modelToReturn.Dates from date tokens found in the text.
        GetInfo(text, modelToReturn);

        foreach (var k in model.Keywords)
        {
            var line = GetTextLine(text, k);
            if (null == line)
            {
                // Keyword not found anywhere in the document.
                modelToReturn.NoResultsKeys.Add(k);
                continue;
            }
            modelToReturn.Results.Add(new KeyValuePair<string, string>(k, line));
        }

        // The JavaScriptSerializer -> JToken round-trip is kept as-is: the stored JSON
        // shape must stay identical for existing consumers of DataEntity.JsonObject.
        var list = new JavaScriptSerializer().Serialize(modelToReturn);
        var dataFormatted = JToken.Parse(list).ToString(Formatting.Indented);
        _db.Data.Add(new DataEntity()
        {
            // NOTE(review): consider DateTime.UtcNow — confirm whether consumers
            // of CreatedOn expect server-local time before changing.
            CreatedOn = DateTime.Now,
            IdCollectionType = (int)CollectionTypeEnum.Pdf,
            JsonObject = dataFormatted
        });
        _db.SaveChanges();

        watch.Stop();
        var elapsedMs = watch.ElapsedMilliseconds;
        System.Diagnostics.Debug.WriteLine("Timp document scraper: " + elapsedMs);
        return Request.CreateResponse(HttpStatusCode.OK, modelToReturn);
    }
    catch (Exception ex)
    {
        // Fix: the exception was silently swallowed (ex was unused); at least trace it
        // so failures are diagnosable. The caller still only sees a plain 500.
        System.Diagnostics.Debug.WriteLine("ScrapeDocument failed: " + ex);
        return new HttpResponseMessage(HttpStatusCode.InternalServerError);
    }
    finally
    {
        // Fix: the temp file leaked on every call; best-effort cleanup.
        if (tempFile != null)
        {
            try { File.Delete(tempFile); } catch (IOException) { /* best effort */ }
        }
    }
}
/// <summary>
/// Scans <paramref name="text"/> for short date tokens (day-month-2-digit-year with
/// '-' or '/' separators), parses them, and stores the distinct dates in ascending
/// order on <paramref name="model"/>.Dates.
/// </summary>
/// <param name="text">Extracted document text to scan.</param>
/// <param name="model">Result object whose Dates list is (re)assigned.</param>
private static void GetInfo(string text, ScrapeDocumentResult model)
{
    // Matches tokens like "12-03-18" or "1/31/99".
    // NOTE(review): the first group ([01]?\d) is month-shaped while parsing below is
    // day-first — confirm the expected source format against real documents.
    var regex = new Regex(@"[01]?\d[/-][0123]?\d[/-]\d{2}");

    // Fix: the original parsed with the single strict format "dd-MM-yy", so any match
    // using '/' separators or single-digit day/month (both allowed by the regex)
    // threw FormatException and aborted the whole scrape. Accept all shapes the
    // regex can produce; "d-M-yy" also covers the two-digit "dd-MM-yy" inputs.
    string[] formats = { "d-M-yy", "d/M/yy" };

    // Fix: HashSet gives O(1) dedup instead of the original O(n^2) List.Any scan.
    var seen = new HashSet<DateTime>();
    foreach (Match match in regex.Matches(text))
    {
        // Fix: TryParseExact silently skips tokens that look date-like but are not
        // valid calendar dates (e.g. "19-39-99") instead of throwing.
        if (DateTime.TryParseExact(match.Value, formats, CultureInfo.InvariantCulture,
                                   DateTimeStyles.None, out var parsedDate))
        {
            seen.Add(parsedDate);
        }
    }

    model.Dates = seen.OrderBy(d => d.Date).ToList();
}