示例#1
0
        /// <summary>
        /// Reads a safety data sheet PDF from the specified stream and looks up the Prop 65 ingredients
        /// matching the CAS numbers found in its section 3.
        /// </summary>
        /// <param name="stream">The stream containing the PDF document.</param>
        /// <returns>
        /// A hydrated safety data sheet. <c>ParsedSuccessfully</c> is <c>true</c> when at least one CAS number
        /// was found; <c>Ingredients</c> holds the ingredients returned by the Prop 65 cache for those numbers.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        /// <remarks>
        /// The algorithm for this method loops on each page of the given PDF until it can match the section headers.
        /// Once section 3 is reported as completed, it stops reading pages to avoid unnecessary processing time.
        /// The extracted page text is passed to the section unchanged.
        /// </remarks>
        public SafetyDataSheet Read(Stream stream)
        {
            if (stream is null)
            {
                throw new System.ArgumentNullException(nameof(stream));
            }

            using var pdfReader   = new PdfReader(stream);
            using var pdfDocument = new PdfDocument(pdfReader);

            // The section is configured with the section 3 and section 4 header patterns
            // (presumably its start and end markers - confirm against SafetyDataSheetSection).
            var section3 = new SafetyDataSheetSection(new RegExTokenMatcher(Section3Regex), new RegExTokenMatcher(Section4Regex));

            var numberOfPages = pdfDocument.GetNumberOfPages();

            // PDF pages are addressed starting at 1; stop as soon as the section reports completion.
            for (var pageNumber = 1; pageNumber <= numberOfPages && !section3.Completed; pageNumber++)
            {
                var page = pdfDocument.GetPage(pageNumber);

                // The extractor already yields a .NET string, so no re-encoding is required. Round-tripping it
                // through Encoding.Default is a no-op where the default is UTF-8 and silently replaces
                // unmappable characters with '?' where the default is an ANSI code page.
                var pageText = PdfTextExtractor.GetTextFromPage(page);

                section3.ExtractMatchingContent(pageText);
            }

            var sectionCasNumbers        = GetCasNumbersFromSection(section3).ToList();
            var matchedProp65Ingredients = _prop65Cache.GetProp65Ingredients(sectionCasNumbers);

            return new SafetyDataSheet
            {
                // Parsing counts as successful only when section 3 yielded at least one CAS number.
                ParsedSuccessfully = sectionCasNumbers.Count > 0,
                Ingredients        = matchedProp65Ingredients
            };
        }
示例#2
0
 /// <summary>
 /// Collects the text of every <c>CasNumberRegex</c> match found in the given section.
 /// </summary>
 /// <param name="section">The section whose <c>Text</c> is scanned.</param>
 /// <returns>
 /// A lazily evaluated sequence containing the matched value of each regex match.
 /// </returns>
 private static IEnumerable<string> GetCasNumbersFromSection(SafetyDataSheetSection section)
 {
     var casMatches = CasNumberRegex.Matches(section.Text);

     // Project each match to the text it captured.
     return casMatches.Select(casMatch => casMatch.Value);
 }