Exemplo n.º 1
0
        /// <summary>
        /// Counts, for every file in <paramref name="filesToAnalyze"/>, how many times each of the
        /// given words occurs in the text of the corresponding PDF found in <paramref name="sourceFolder"/>.
        /// The per-word counters are stored in the file's <c>Descriptions</c> collection, which is reset first.
        /// </summary>
        /// <param name="filesToAnalyze">Rows describing the PDF files to scan; they are updated in place.</param>
        /// <param name="wordsToAnalysis">Words to count. Each one is passed to <see cref="Regex.Matches(string, string)"/> as the pattern.</param>
        /// <param name="sourceFolder">Folder that is combined with each row's <c>FileName</c> to locate the PDF.</param>
        /// <param name="isCancellationRequested">Flag polled before each file; when set, processing stops immediately.</param>
        /// <param name="reportProgress">Progress callback; it is not invoked by this method.</param>
        /// <returns>The same <paramref name="filesToAnalyze"/> sequence that was passed in.</returns>
        public IEnumerable<IRowModel> AnalyzeAllPDFs(IEnumerable<IRowModel> filesToAnalyze, string[] wordsToAnalysis, string sourceFolder,
            ValueWrapper<bool> isCancellationRequested, ReportProgessEvent reportProgress)
        {
            foreach (var file in filesToAnalyze)
            {
                if (isCancellationRequested.Value)
                {
                    return filesToAnalyze;
                }

                try
                {
                    // Start from a clean set of counters, one per searched word.
                    if (file.Descriptions.Any())
                    {
                        file.Descriptions.Clear();
                    }

                    foreach (var searchedWord in wordsToAnalysis)
                    {
                        file.Descriptions.Add(new DescriptionModel {Word = searchedWord, Occurances = 0});
                    }

                    var pdfReader = new PdfReader(Path.Combine(sourceFolder, file.FileName));
                    try
                    {
                        int numberOfPages = pdfReader.NumberOfPages;

                        // Page numbers are 1-based and inclusive of NumberOfPages, so the
                        // loop must use <= or the last page is never scanned.
                        for (int page = 1; page <= numberOfPages; page++)
                        {
                            string text = PdfTextExtractor.GetTextFromPage(pdfReader, page);
                            foreach (var word in file.Descriptions)
                            {
                                word.Occurances += Regex.Matches(text, word.Word).Count;
                            }
                        }
                    }
                    finally
                    {
                        // Release the underlying file even when text extraction fails.
                        pdfReader.Close();
                    }

                    file.Analyzed = true;
                }
                catch (Exception)
                {
                    // Best effort: a file that cannot be opened or parsed is skipped
                    // (no OnFileAnalyzed notification) and the remaining files are still processed.
                    continue;
                }

                if (OnFileAnalyzed != null)
                {
                    OnFileAnalyzed.Invoke(file);
                }
            }

            return filesToAnalyze;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads every link into the current working directory (using the file name taken from the
        /// link's local path) and then copies the downloaded file into <paramref name="destinationPath"/>,
        /// overwriting an existing file of the same name. <c>OnFileDownloaded</c> is raised with the
        /// destination path of each file that was downloaded and copied.
        /// </summary>
        /// <param name="links">Links to download.</param>
        /// <param name="destinationPath">Folder the downloaded files are copied to.</param>
        /// <param name="reportProgess">Callback receiving the percentage of links started so far.</param>
        /// <param name="isCancellationRequested">Flag polled before each link; when set, the method returns.</param>
        public void DownloadFiles(IEnumerable<Uri> links, string destinationPath, ReportProgessEvent reportProgess, ValueWrapper<bool> isCancellationRequested)
        {
            var linksList = links.ToList();
            int counter = 0;

            // WebClient is disposable; one instance is shared by all downloads and released at the end.
            using (var client = new WebClient())
            {
                foreach (var link in linksList)
                {
                    if (isCancellationRequested.Value)
                    {
                        return;
                    }

                    string filename = Path.GetFileName(link.LocalPath);

                    reportProgess(counter++*100/linksList.Count);

                    try
                    {
                        client.DownloadFile(link, filename);
                        // Short pause between consecutive downloads.
                        Thread.Sleep(500);
                    }
                    catch (Exception)
                    {
                        // The download failed, so there is nothing valid to copy. Skip this link
                        // instead of failing in File.Copy or publishing a stale/partial file.
                        continue;
                    }

                    var path = Path.Combine(destinationPath, filename);

                    // overwrite: true replaces an existing destination file.
                    File.Copy(filename, path, true);

                    if (OnFileDownloaded != null)
                    {
                        OnFileDownloaded.Invoke(path);
                    }
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Requests every page returned by <c>GetAllLinksFromTableOfContent</c>, parses the HTML ones and
        /// collects the targets of all anchors whose link has a ".pdf" extension. Relative links are
        /// resolved against a hard-coded root address. The collected list is passed to
        /// <c>InitData.SerializePrevLinks</c> after every parsed page and once more at the end.
        /// </summary>
        /// <param name="progressAction">
        /// Callback receiving the progress as a percentage; called with 5 before the crawl starts and
        /// once per page afterwards (the value is capped at 100).
        /// </param>
        /// <param name="IsCancellationRequested">Flag polled before each page; when set, the crawl stops.</param>
        /// <returns>The PDF links that were found, or <c>null</c> when cancellation was requested.</returns>
        /// <exception cref="Exception">
        /// Thrown when <c>SettingsService.DownloadedFilesDestination</c> is null or empty, or does not
        /// point to an existing directory.
        /// </exception>
        public IEnumerable<Uri> GetAllLinks(ReportProgessEvent progressAction, ValueWrapper<bool> IsCancellationRequested)
        {
            if (string.IsNullOrEmpty(SettingsService.DownloadedFilesDestination))
            {
                throw new Exception(String.Format("Należy podać poprawną lokalizację!"));
            }
            if (!Directory.Exists(SettingsService.DownloadedFilesDestination))
            {
                throw new Exception(string.Format("Podana lokalizacja {0} nie istnieje!",
                                                  SettingsService.DownloadedFilesDestination));
            }

            int allLinksFromTableOfContentCount;
            progressAction(5);
            var allLinksFromTableOfContent = GetAllLinksFromTableOfContent(out allLinksFromTableOfContentCount);
            var allPdfUrls = new List<Uri>();

            // Base address used to resolve relative links.
            // TODO: read this value from the settings instead of hard-coding it.
            var root = new Uri("http://www.stat.gov.pl/");
            int counter = 0;

            foreach (var pageUrl in allLinksFromTableOfContent)
            {
                if (IsCancellationRequested.Value)
                {
                    return null;
                }
                counter++;

                // The +5 offset accounts for the initial step; cap the value so that the
                // last pages never report more than 100 percent.
                progressAction(Math.Min(100, counter * 100 / allLinksFromTableOfContentCount + 5));

                var oReq = (HttpWebRequest)WebRequest.Create(pageUrl);
                oReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
                WebResponse resp;

                try
                {
                    Console.WriteLine("prev resp");
                    resp = oReq.GetResponse();
                    Console.WriteLine("after resp");
                }
                catch (Exception)
                {
                    // Best effort: a page that cannot be requested is skipped.
                    continue;
                }

                // Dispose the response on every path (including the 'continue' statements below);
                // undisposed responses keep their connections open.
                using (resp)
                {
                    Thread.Sleep(60);

                    string contentType = resp.ContentType;
                    if (contentType == null ||
                        !contentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
                    {
                        continue;
                    }

                    var doc = new HtmlDocument();
                    try
                    {
                        using (var stream = resp.GetResponseStream())
                        {
                            doc.Load(stream); // The HtmlAgilityPack
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: a page that cannot be read or parsed is skipped.
                        continue;
                    }

                    // SelectNodes yields null (not an empty collection) when nothing matches.
                    var anchors = doc.DocumentNode.SelectNodes(@"//a[@href]");
                    if (anchors != null)
                    {
                        foreach (HtmlNode link in anchors)
                        {
                            HtmlAttribute att = link.Attributes["href"];
                            if (att == null)
                            {
                                continue;
                            }

                            string href = att.Value;
                            if (href.StartsWith("javascript", StringComparison.InvariantCultureIgnoreCase))
                            {
                                continue; // ignore javascript on buttons using a tags
                            }

                            // A malformed href must not abort the whole crawl, so it is skipped.
                            Uri urlNext;
                            if (!Uri.TryCreate(href, UriKind.RelativeOrAbsolute, out urlNext))
                            {
                                continue;
                            }

                            string extension = Path.GetExtension(urlNext.ToString());

                            // Case-insensitive so that links ending in ".PDF" are collected as well.
                            if (string.Equals(extension, ".pdf", StringComparison.OrdinalIgnoreCase))
                            {
                                if (!urlNext.IsAbsoluteUri)
                                {
                                    urlNext = new Uri(root, urlNext);
                                }
                                allPdfUrls.Add(urlNext);
                            }
                        }
                    }

                    // Persist what has been collected so far after every parsed page.
                    InitData.SerializePrevLinks(allPdfUrls);
                }
            }

            InitData.SerializePrevLinks(allPdfUrls);

            return allPdfUrls;
        }