Пример #1
0
        // Shared by all requests: creating a new HttpClient per call (and never disposing it)
        // leaks connections and can exhaust sockets under load.
        private static readonly HttpClient DocumentHttpClient = new HttpClient();

        /// <summary>
        /// Downloads the document at <c>model.Url</c> to a temporary file, extracts its text,
        /// collects the dates found in it (see <c>GetInfo</c>) and, for every requested keyword,
        /// the line returned by <c>GetTextLine</c>. Keywords without a line are reported in
        /// <c>NoResultsKeys</c>. The result is persisted as indented JSON and returned.
        /// </summary>
        /// <param name="model">Request carrying the document URL and the keywords to look up.</param>
        /// <returns>
        /// 200 with the scrape result on success; 400 when the model or its URL is missing;
        /// 500 when downloading, parsing or persisting fails.
        /// </returns>
        public async Task <HttpResponseMessage> ScrapeDocument(DocumentModel model)
        {
            // Reject unusable input up front instead of letting it surface as a NullReferenceException.
            if (model == null || model.Url == null)
            {
                return(new HttpResponseMessage(HttpStatusCode.BadRequest));
            }

            string tempFile = null;

            try
            {
                var watch = System.Diagnostics.Stopwatch.StartNew();

                var modelToReturn = new ScrapeDocumentResult()
                {
                    NoResultsKeys = new List <string>(),
                    Results       = new List <KeyValuePair <string, string> >()
                };

                tempFile = Path.GetTempFileName();

                // Both streams are disposed as soon as the copy finishes; the copy itself is
                // awaited so the request thread is not blocked on network/disk I/O.
                using (var data = await DocumentHttpClient.GetStreamAsync(model.Url))
                using (var fs = File.OpenWrite(tempFile))
                {
                    await data.CopyToAsync(fs);
                }

                var text = Helpers.Helpers.ExtractTextFromPdf(tempFile);

                GetInfo(text, modelToReturn);

                // A request without keywords still yields the dates; it is not an error.
                if (model.Keywords != null)
                {
                    foreach (var k in model.Keywords)
                    {
                        var line = GetTextLine(text, k);
                        if (null == line)
                        {
                            modelToReturn.NoResultsKeys.Add(k);
                            continue;
                        }

                        modelToReturn.Results.Add(new KeyValuePair <string, string>(k, line));
                    }
                }

                var list          = new JavaScriptSerializer().Serialize(modelToReturn);
                var dataFormatted = JToken.Parse(list).ToString(Formatting.Indented);

                _db.Data.Add(new DataEntity()
                {
                    CreatedOn        = DateTime.Now,
                    IdCollectionType = (int)CollectionTypeEnum.Pdf,
                    JsonObject       = dataFormatted
                });
                _db.SaveChanges();

                watch.Stop();
                var elapsedMs = watch.ElapsedMilliseconds;

                System.Diagnostics.Debug.WriteLine("Timp document scraper: " + elapsedMs);
                return(Request.CreateResponse(HttpStatusCode.OK, modelToReturn));
            }
            catch (Exception ex)
            {
                // Keep the 500 contract, but leave a trace of what actually went wrong.
                System.Diagnostics.Debug.WriteLine("Document scraper failed: " + ex);
                return(new HttpResponseMessage(HttpStatusCode.InternalServerError));
            }
            finally
            {
                DeleteTempFile(tempFile);
            }
        }

        // Best-effort removal of the downloaded copy; a failed delete must not change the response.
        private static void DeleteTempFile(string path)
        {
            if (path == null)
            {
                return;
            }

            try
            {
                File.Delete(path);
            }
            catch (IOException ex)
            {
                System.Diagnostics.Debug.WriteLine("Could not delete temp file " + path + ": " + ex.Message);
            }
            catch (UnauthorizedAccessException ex)
            {
                System.Diagnostics.Debug.WriteLine("Could not delete temp file " + path + ": " + ex.Message);
            }
        }
Пример #2
0
        // Built once instead of on every call. Finds date-looking tokens: one or two digits,
        // '/' or '-', one or two digits, '/' or '-', two-digit year.
        private static readonly Regex DateCandidateRegex = new Regex(@"[01]?\d[/-][0123]?\d[/-]\d{2}");

        // Day-first formats, keeping the original "dd-MM-yy" interpretation. The single-letter
        // specifiers accept one or two digits, and both separators the regex allows are covered.
        private static readonly string[] DateCandidateFormats = { "d-M-yy", "d/M/yy" };

        /// <summary>
        /// Finds the dates mentioned in <paramref name="text"/> and stores them, de-duplicated
        /// and in ascending order, in <c>model.Dates</c>.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields an empty date list.</param>
        /// <param name="model">Result object whose <c>Dates</c> property is overwritten.</param>
        /// <remarks>
        /// Tokens that match the pattern but are not valid day-month-year dates (for example a
        /// month of 25, or mixed separators) are skipped rather than aborting the whole scrape.
        /// </remarks>
        private static void GetInfo(string text, ScrapeDocumentResult model)
        {
            var uniqueDates = new HashSet <DateTime>();

            if (!string.IsNullOrEmpty(text))
            {
                // Typed iteration variable: MatchCollection is non-generic on .NET Framework.
                foreach (Match candidate in DateCandidateRegex.Matches(text))
                {
                    DateTime parsedDate;
                    var isDate = DateTime.TryParseExact(
                        candidate.Value,
                        DateCandidateFormats,
                        CultureInfo.InvariantCulture,
                        DateTimeStyles.None,
                        out parsedDate);

                    if (isDate)
                    {
                        uniqueDates.Add(parsedDate);
                    }
                }
            }

            model.Dates = uniqueDates.OrderBy(o => o).ToList();
        }