/// <summary>
/// Counts, for every file in <paramref name="filesToAnalyze"/>, how often each search word occurs
/// in the text of the matching PDF located in <paramref name="sourceFolder"/>.
/// </summary>
/// <param name="filesToAnalyze">Rows to analyze; each row's Descriptions collection is reset and refilled.</param>
/// <param name="wordsToAnalysis">
/// Search terms. Each term is passed to <see cref="Regex.Matches(string, string)"/> as the pattern,
/// so regex metacharacters in a term are interpreted as regex syntax, not as literal text.
/// </param>
/// <param name="sourceFolder">Folder that is combined with each row's FileName to locate the PDF.</param>
/// <param name="isCancellationRequested">Checked before each file; when set, processing stops early.</param>
/// <param name="reportProgress">Currently not invoked by this method.</param>
/// <returns>The same <paramref name="filesToAnalyze"/> instance that was passed in.</returns>
public IEnumerable<IRowModel> AnalyzeAllPDFs(IEnumerable<IRowModel> filesToAnalyze, string[] wordsToAnalysis, string sourceFolder, ValueWrapper<bool> isCancellationRequested, ReportProgessEvent reportProgress)
{
    foreach (var file in filesToAnalyze)
    {
        if (isCancellationRequested.Value)
        {
            return filesToAnalyze;
        }

        try
        {
            // Start every run from zeroed counters for exactly the requested words.
            if (file.Descriptions.Any())
            {
                file.Descriptions.Clear();
            }

            foreach (var searchWord in wordsToAnalysis)
            {
                file.Descriptions.Add(new DescriptionModel { Word = searchWord, Occurances = 0 });
            }

            var pdfReader = new PdfReader(Path.Combine(sourceFolder, file.FileName));
            try
            {
                int numberOfPages = pdfReader.NumberOfPages;

                // Pages are numbered from 1, so the last page is numberOfPages itself (inclusive bound).
                for (int page = 1; page <= numberOfPages; page++)
                {
                    string text = PdfTextExtractor.GetTextFromPage(pdfReader, page);
                    foreach (var description in file.Descriptions)
                    {
                        description.Occurances += Regex.Matches(text, description.Word).Count;
                    }
                }
            }
            finally
            {
                // Release the underlying file even when text extraction fails part-way.
                pdfReader.Close();
            }

            file.Analyzed = true;
        }
        catch (Exception)
        {
            // Best effort: a file that cannot be read or parsed is left with Analyzed unchanged,
            // no OnFileAnalyzed notification is raised for it, and processing moves on.
            continue;
        }

        // Copy to a local so the null check and the invocation see the same delegate.
        var fileAnalyzedHandler = OnFileAnalyzed;
        if (fileAnalyzedHandler != null)
        {
            fileAnalyzedHandler.Invoke(file);
        }
    }

    return filesToAnalyze;
}
/// <summary>
/// Downloads every link into the current working directory and then copies the file into
/// <paramref name="destinationPath"/>, raising OnFileDownloaded with the destination path of
/// each file that was downloaded successfully.
/// </summary>
/// <param name="links">Absolute URIs to download; the file name is taken from the URI's local path.</param>
/// <param name="destinationPath">Directory the downloaded files are copied into (existing files are overwritten).</param>
/// <param name="reportProgess">Called before each download with the integer percentage of links already handled.</param>
/// <param name="isCancellationRequested">Checked before each link; when set, the method returns immediately.</param>
public void DownloadFiles(IEnumerable<Uri> links, string destinationPath, ReportProgessEvent reportProgess, ValueWrapper<bool> isCancellationRequested)
{
    var linksList = links.ToList();
    int counter = 0;

    // One client is reused for all downloads and disposed when the method exits.
    using (var client = new WebClient())
    {
        foreach (var link in linksList)
        {
            if (isCancellationRequested.Value)
            {
                return;
            }

            string filename = Path.GetFileName(link.LocalPath);
            reportProgess(counter++ * 100 / linksList.Count);

            try
            {
                client.DownloadFile(link, filename);
                Thread.Sleep(500); // pause between consecutive downloads
            }
            catch (Exception)
            {
                // Best effort: a link that fails to download is skipped. Without this, the copy
                // below would throw for a missing file or publish a stale/partial one.
                continue;
            }

            var path = Path.Combine(destinationPath, filename);

            // When the destination is the working directory the file is already in place;
            // deleting or copying it onto itself would destroy the download.
            bool alreadyInPlace = string.Equals(
                Path.GetFullPath(filename),
                Path.GetFullPath(path),
                StringComparison.OrdinalIgnoreCase);
            if (!alreadyInPlace)
            {
                File.Copy(filename, path, true);
            }

            // Copy to a local so the null check and the invocation see the same delegate.
            var fileDownloadedHandler = OnFileDownloaded;
            if (fileDownloadedHandler != null)
            {
                fileDownloadedHandler.Invoke(path);
            }
        }
    }
}
/// <summary>
/// Requests every page returned by GetAllLinksFromTableOfContent, scans each HTML response for
/// anchors whose href has a ".pdf" extension, and returns those links as absolute URIs.
/// The collected list is handed to InitData.SerializePrevLinks after every scanned HTML page
/// and once more at the end.
/// </summary>
/// <param name="progressAction">Called with 5 at the start and then with a percentage (capped at 100) after each page is picked up.</param>
/// <param name="IsCancellationRequested">Checked before each page; when set, the method returns null.</param>
/// <returns>All PDF links found, or null when cancellation was requested.</returns>
/// <exception cref="Exception">
/// SettingsService.DownloadedFilesDestination is null/empty or does not name an existing directory.
/// </exception>
public IEnumerable<Uri> GetAllLinks(ReportProgessEvent progressAction, ValueWrapper<bool> IsCancellationRequested)
{
    if (string.IsNullOrEmpty(SettingsService.DownloadedFilesDestination))
    {
        throw new Exception("Należy podać poprawną lokalizację!");
    }

    if (!Directory.Exists(SettingsService.DownloadedFilesDestination))
    {
        throw new Exception(string.Format("Podana lokalizacja {0} nie istnieje!", SettingsService.DownloadedFilesDestination));
    }

    int allLinksFromTableOfContentCount;
    progressAction(5);
    var allLinksFromTableOfContent = GetAllLinksFromTableOfContent(out allLinksFromTableOfContentCount);

    // Guards the percentage computation against a reported count of zero.
    int totalPages = Math.Max(allLinksFromTableOfContentCount, 1);

    // TODO: take the site root from settings instead of hard-coding it.
    // NOTE(review): relative hrefs are resolved against the site root, not against the page
    // they were found on - confirm this is what the target site needs.
    var root = new Uri("http://www.stat.gov.pl/");

    var allPdfUrls = new List<Uri>();
    int counter = 0;

    foreach (var pageUrl in allLinksFromTableOfContent)
    {
        if (IsCancellationRequested.Value)
        {
            return null;
        }

        counter++;
        // The initial 5% offset would push the last pages past 100, so the value is capped.
        progressAction(Math.Min(100, counter * 100 / totalPages + 5));

        var oReq = (HttpWebRequest)WebRequest.Create(pageUrl);
        oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";

        WebResponse resp;
        try
        {
            Console.WriteLine("prev resp");
            resp = oReq.GetResponse();
            Console.WriteLine("after resp");
        }
        catch (Exception)
        {
            // Best effort: pages that cannot be fetched are skipped.
            continue;
        }

        // The response must be released, otherwise its connection is never returned to the
        // pool and later requests to the same host can stall.
        using (resp)
        {
            Thread.Sleep(60);

            if (!resp.ContentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
            {
                continue;
            }

            var doc = new HtmlDocument();
            try
            {
                doc.Load(resp.GetResponseStream()); // The HtmlAgilityPack
            }
            catch (Exception)
            {
                // Best effort: pages that cannot be read or parsed are skipped.
                continue;
            }

            // SelectNodes yields null (not an empty collection) when the page has no matching anchors.
            var anchors = doc.DocumentNode.SelectNodes(@"//a[@href]");
            if (anchors != null)
            {
                foreach (HtmlNode link in anchors)
                {
                    HtmlAttribute att = link.Attributes["href"];
                    if (att == null)
                    {
                        continue;
                    }

                    string href = att.Value;

                    // Ignore javascript on buttons implemented with <a> tags.
                    if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }

                    // A single malformed href must not abort the whole crawl.
                    Uri urlNext;
                    if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out urlNext))
                    {
                        continue;
                    }

                    string extension = Path.GetExtension(urlNext.ToString());
                    if (!string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    if (!urlNext.IsAbsoluteUri)
                    {
                        urlNext = new Uri(root, urlNext);
                    }

                    allPdfUrls.Add(urlNext);
                }
            }

            InitData.SerializePrevLinks(allPdfUrls);
        }
    }

    InitData.SerializePrevLinks(allPdfUrls);
    return allPdfUrls;
}