/// <summary>
/// Reads a safety data sheet PDF from the specified stream.
/// </summary>
/// <param name="stream">The stream containing the PDF document.</param>
/// <returns>
/// A hydrated safety data sheet. <c>ParsedSuccessfully</c> is <c>true</c> when at least one CAS number
/// was found in the extracted section text; <c>Ingredients</c> holds whatever the Prop 65 cache
/// returned for those CAS numbers.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
/// <remarks>
/// The algorithm for this method loops on each page of the given PDF until it can match the section headers.
/// Once it finds section 3 and copies the CAS numbers out, it will exit to avoid unnecessary processing time.
/// </remarks>
public SafetyDataSheet Read(Stream stream)
{
    if (stream is null)
    {
        throw new ArgumentNullException(nameof(stream));
    }

    using var pdfReader = new PdfReader(stream);
    using var pdfDocument = new PdfDocument(pdfReader);

    // The section is built from the section 3 and section 4 header matchers
    // (presumably the start and end markers of the section - confirm against SafetyDataSheetSection).
    var section3 = new SafetyDataSheetSection(
        new RegExTokenMatcher(Section3Regex),
        new RegExTokenMatcher(Section4Regex));

    // Page numbering starts at 1. Stop as soon as the section reports completion
    // so the remaining pages are never extracted.
    var numberOfPages = pdfDocument.GetNumberOfPages();
    for (var pageNumber = 1; pageNumber <= numberOfPages && !section3.Completed; pageNumber++)
    {
        var page = pdfDocument.GetPage(pageNumber);
        var pageText = PdfTextExtractor.GetTextFromPage(page);

        // Round-trips the extracted text through Encoding.Default into UTF-8.
        // NOTE(review): where Encoding.Default is already UTF-8 this is presumably a no-op for
        // well-formed text; with an ANSI default it would replace characters that code page
        // cannot represent. Confirm which behaviour is intended before removing it.
        pageText = Encoding.UTF8.GetString(
            Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(pageText)));

        section3.ExtractMatchingContent(pageText);
    }

    // Materialise once: the list is both counted and handed to the cache.
    var sectionCasNumbers = GetCasNumbersFromSection(section3).ToList();
    var matchedProp65Ingredients = _prop65Cache.GetProp65Ingredients(sectionCasNumbers);

    return new SafetyDataSheet
    {
        // Parsing counts as successful only when at least one CAS number was found.
        ParsedSuccessfully = sectionCasNumbers.Count > 0,
        Ingredients = matchedProp65Ingredients
    };
}
/// <summary>
/// Extracts every substring of the section's text that matches <c>CasNumberRegex</c>.
/// </summary>
/// <param name="section">The section whose text is searched.</param>
/// <returns>
/// A lazily evaluated sequence of the matched values, in match order; empty when the section's
/// text is <c>null</c> or contains no match.
/// </returns>
private static IEnumerable<string> GetCasNumbersFromSection(SafetyDataSheetSection section)
{
    var text = section.Text;

    // Regex.Matches throws ArgumentNullException on null input. Treat missing text as
    // "no CAS numbers" so the caller can report an unsuccessful parse instead of failing.
    // NOTE(review): whether Text can be null when nothing was extracted is not visible here;
    // this guard is defensive.
    if (text is null)
    {
        return Enumerable.Empty<string>();
    }

    return CasNumberRegex.Matches(text)
        .Select(match => match.Value);
}