/// <summary>
/// Downloads the PDF at <paramref name="remoteFile"/> and fills a <see cref="PoolSchedule"/>
/// with the document's modification date and the date range read from the
/// "Harmonogram ... od ... do ..." header line on the first page.
/// </summary>
/// <param name="remoteFile">URL of the PDF to download; also stored as the schedule's <c>Link</c>.</param>
/// <returns>
/// A result with <c>Success = true</c> and the schedule when no exception occurred;
/// otherwise <c>Success = false</c> and <c>Schedule = null</c> (the exception is logged).
/// </returns>
public async Task<FileParseResult> ParseFile(string remoteFile)
{
    PoolSchedule poolSchedule = new PoolSchedule();
    poolSchedule.Link = remoteFile;

    FileParseResult parseResult = new FileParseResult();
    parseResult.Success = true;
    parseResult.Schedule = poolSchedule;

    try
    {
        // 'using' guarantees the client and the response are released even when
        // downloading or parsing throws.
        using (HttpClient httpClient = new HttpClient())
        using (HttpResponseMessage httpResponseMessage = await httpClient.GetAsync(remoteFile))
        {
            // NOTE(review): on a non-success status nothing is parsed and the result still
            // reports Success with only Link set; CheckDates() below is the only validation.
            if (httpResponseMessage.IsSuccessStatusCode)
            {
                Stream contentStream = await httpResponseMessage.Content.ReadAsStreamAsync();

                string extractedText;
                PdfLoadedDocument loadedDocument = new PdfLoadedDocument(contentStream);
                try
                {
                    poolSchedule.ModificationDate = loadedDocument.DocumentInformation.ModificationDate;

                    // Only the first page is inspected for the header.
                    extractedText = loadedDocument.Pages[0].ExtractText();
                }
                finally
                {
                    // Close the document on every path, not only on success.
                    loadedDocument.Close(true);
                }

                // SingleOrDefault throws when more than one line matches; the catch below
                // turns that into a failed result.
                string header = extractedText
                    .Split("\r\n")
                    .SingleOrDefault(line => line.StartsWith("Harmonogram", StringComparison.Ordinal));

                // Without a header line the dates are left untouched.
                if (header != null)
                {
                    // Everything from the first "od" onwards: "od <start> do <end><4 trailing chars>".
                    // A header lacking "od" or "do" makes Substring throw, which the catch below
                    // reports as a failed parse.
                    string dates = header.Substring(header.IndexOf("od", StringComparison.Ordinal));
                    int toIndex = dates.IndexOf("do", StringComparison.Ordinal);
                    int endStart = toIndex + 2;

                    string startDateString = dates.Substring(2, toIndex - 2).Trim();

                    // The last four characters of the header are not part of the end date.
                    string endDateString = dates.Substring(endStart, dates.Length - 4 - endStart).Trim();

                    GetValuesFromDateString(startDateString, poolSchedule, DateType.From);
                    GetValuesFromDateString(endDateString, poolSchedule, DateType.To);
                }
            }

            poolSchedule.CheckDates();
        }
    }
    catch (Exception ex)
    {
        logger.LogError(ex, "Błąd podczas parsowania pliku PDF");
        parseResult.Success = false;
        parseResult.Schedule = null;
    }

    return parseResult;
}
/// <summary>
/// Scrapes the swimming-pool page for schedule links (anchors whose <c>href</c> contains
/// "niecka"), parses each linked file and synchronises the result with the
/// "PoolSchedules" MongoDB collection: unknown schedules are inserted, schedules whose
/// modification date changed are updated. A notification is sent in both cases.
/// </summary>
/// <remarks>Any exception is logged and swallowed; the method itself does not throw.</remarks>
private async Task CheckNewSchedule()
{
    logger.LogInformation("Check if new schedule exists.");

    try
    {
        var collectionName = "PoolSchedules";
        var database = mongoClient.GetDatabase("SwimmingPoolTracker");
        var collection = database.GetCollection<PoolSchedule>(collectionName);

        string url = "https://mzuk.gliwice.pl/jednostka/kryte-plywalnie/kryta-plywalnia-olimpijczyk/";
        HtmlWeb hw = new HtmlWeb();
        HtmlDocument doc = hw.Load(url);

        // SelectNodes returns null (not an empty collection) when nothing matches;
        // iterating it directly would end in a NullReferenceException.
        HtmlNodeCollection siteLinks = doc.DocumentNode.SelectNodes(".//a[contains(@href,'niecka')]");
        if (siteLinks == null)
        {
            logger.LogWarning("No schedule links found on the page.");
            return;
        }

        foreach (HtmlNode link in siteLinks)
        {
            FileParseResult parseResult = await ParseFile(link.Attributes["href"].Value);
            if (!parseResult.Success)
            {
                continue;
            }

            PoolSchedule schedule = parseResult.Schedule;
            PoolSchedule exist = await collection.Find(x => x.Id == schedule.Id).SingleOrDefaultAsync();

            if (exist == null)
            {
                await collection.InsertOneAsync(schedule);
                await SendNotification("New schedule", schedule);
            }
            // MongoDB stores dates in UTC, hosting machine timezone is set to UTC+1
            else if (exist.ModificationDate.ToLocalTime() != schedule.ModificationDate)
            {
                // Only the modification date of the stored document is refreshed.
                exist.ModificationDate = schedule.ModificationDate;
                await collection.ReplaceOneAsync(d => d.Id == exist.Id, exist);
                await SendNotification("Changes in schedule", schedule);
            }
        }
    }
    catch (Exception ex)
    {
        logger.LogError(ex, "Error during checking new schedule");
    }
}