/// <summary>
/// Reads the input items from <c>Request.InputFilePath</c> and runs the extraction
/// for each of them, one after another.
/// </summary>
/// <returns>
/// The shared <c>Response</c> object. If the input file cannot be read,
/// <c>GeneralException</c> is set (to the result of <c>GetInnerMostException()</c>) and
/// nothing else is processed. Otherwise <c>SuccessfulExtractions</c> holds the number of
/// items processed without an exception, and every failed item is recorded in
/// <c>ExtractionExceptions</c>.
/// </returns>
public ExtractionResponse RunExtraction()
{
    List<InputFileItem> itemsToProcess;
    var inputReader = new InputFileReader(_csvDelimiter);

    // A failure while loading the input aborts the whole run.
    try
    {
        itemsToProcess = inputReader.GetInputItemCollection(Request.InputFilePath);
    }
    catch (Exception readError)
    {
        Response.GeneralException = readError.GetInnerMostException();
        return Response;
    }

    var completed = 0;

    // Collaborators are created once and reused for every input item.
    var aggregator = new MarkupAggregator(
        HttpAgent,
        VerboseLogger,
        _hrefRegex,
        _invalidSiteLinkPatterns);
    var extractor = new EmailExtractor(_emailRegex);
    var writer = new OutputFileWriter(_csvDelimiter, Request.OutputDirectory);

    foreach (var item in itemsToProcess)
    {
        VerboseLogger.Log($"Extracting from {item.SiteUrl} url...");

        try
        {
            ProcessInputFileItem(item, aggregator, extractor, writer);
            VerboseLogger.Log($"Extraction completed for {item.SiteUrl} url...");
            completed += 1;
        }
        catch (Exception itemError)
        {
            // A single failing item must not stop the run; record it and continue.
            RecordExtractionFailure(item, itemError);
        }
    }

    Response.SuccessfulExtractions = completed;
    return Response;
}

/// <summary>
/// Wraps <paramref name="cause"/> in an <c>ExtractionException</c> tagged with the item's
/// site url, logs the failure, and appends it to <c>Response.ExtractionExceptions</c>
/// (creating the list on first use).
/// </summary>
/// <param name="inputItem">The input item whose processing failed.</param>
/// <param name="cause">The exception thrown while processing the item.</param>
private void RecordExtractionFailure(InputFileItem inputItem, Exception cause)
{
    var failure = new ExtractionException("", cause)
    {
        InputUrl = inputItem.SiteUrl
    };

    VerboseLogger.Log("Extraction failed. Moving to next url...");

    var recordedFailures = Response.ExtractionExceptions ?? new List<ExtractionException>();
    Response.ExtractionExceptions = recordedFailures;
    recordedFailures.Add(failure);
}
/// <summary>
/// Runs the extraction pipeline for a single input item: aggregates markup starting from
/// the item's site url, extracts emails from that markup, and hands the result to the
/// output writer. Progress is reported through <c>VerboseLogger</c> after each of the
/// first two steps.
/// </summary>
/// <param name="inputItem">The input item whose <c>SiteUrl</c> is processed.</param>
/// <param name="markupAggregator">Collects the markup reachable from the site url.</param>
/// <param name="emailExtractor">Extracts emails from the aggregated markup.</param>
/// <param name="fileWriter">Receives the extraction result for output.</param>
private void ProcessInputFileItem(
    InputFileItem inputItem,
    MarkupAggregator markupAggregator,
    EmailExtractor emailExtractor,
    OutputFileWriter fileWriter)
{
    // Step 1: gather markup for the site.
    var siteMarkup = markupAggregator.AggregateMarkupFromRootUrl(inputItem.SiteUrl);
    string pagesMessage = $"...collected #{siteMarkup.Count} unique pages";
    VerboseLogger.Log(pagesMessage);

    // Step 2: pull emails out of the gathered markup.
    var extraction = emailExtractor.ExtractEmailsFromMarkup(inputItem.SiteUrl, siteMarkup);
    string emailsMessage =
        $"...identified #{extraction.Emails.Count} unique emails and has mailto tag = {extraction.HasMailto}";
    VerboseLogger.Log(emailsMessage);

    // Step 3: persist the result.
    fileWriter.WriteExtractResult(extraction);
}